diff --git a/.gitignore b/.gitignore index b640999cc..a2ff166e5 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,5 @@ resources/ velocity.log perf verify +maven-metadata-local.xml +dependency-reduced-pom.xml diff --git a/ant-bridge.sh b/ant-bridge.sh new file mode 100755 index 000000000..9f4713d7c --- /dev/null +++ b/ant-bridge.sh @@ -0,0 +1,173 @@ +#!/bin/sh + +mvn_args="verify" +mvn_properties= +mvn_clean= +unknown_args= +property_regex='-D(.*)=(.*)' +unit_test_regex='.*UnitTest' +post_script= +run_type="run" + +for arg in "${@}" ; do + if [[ "${arg}" == "dry" ]] ; then + run_type="dry" + + elif [[ "${arg}" == "clean" ]] ; then + mvn_clean="clean" + mvn_args= + + elif [[ "${arg}" =~ ${property_regex} ]] ; then + property_name=${BASH_REMATCH[1]} + property_value=${BASH_REMATCH[2]} + + if [[ "${property_name}" == "single" ]] ; then + test_property="test" + test_disabled="it.test" + if [[ ! "${property_value}" =~ ${unit_test_regex} ]] ; then + test_property="it.test" + test_disabled="test" + fi + + mvn_properties="${mvn_properties} -D${test_disabled}=disabled -D${test_property}=${property_value}" + + elif [[ "${property_name}" == "test.debug.port" ]] ; then + mvn_properties="${mvn_properties} -Dmaven.surefire.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\"" + mvn_properties="${mvn_properties} -Dmaven.failsafe.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\"" + + elif [[ "${property_name}" == "test.default.maxmemory" ]] ; then + mvn_properties="${mvn_properties} -Dtest.maxmemory=${property_value}" + + else + unknown_args="${unknown_args} \"${arg}\"" + + fi + + else + if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then + echo "Sorry, this script does not currently support mixing targets." 
>&2 + exit 1 + + elif [[ "${arg}" == "dist" ]] ; then + mvn_args="verify" + + elif [[ "${arg}" == "gatk" ]] ; then + mvn_args="verify '-P!queue'" + + elif [[ "${arg}" == "test.compile" ]] ; then + mvn_args="test-compile" + + elif [[ "${arg}" == "gatkdocs" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue" + + elif [[ "${arg}" == "package.gatk.full" ]] ; then + mvn_args="package '-P!private,!queue'" + + elif [[ "${arg}" == "package.gatk.all" ]] ; then + mvn_args="package '-P!queue'" + + elif [[ "${arg}" == "package.queue.full" ]] ; then + mvn_args="package '-P!private'" + + elif [[ "${arg}" == "package.queue.all" ]] ; then + mvn_args="package" + +# elif [[ "${arg}" == "release.gatk.full" ]] ; then +# mvn_args="package '-P!private,!queue'" +# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2" + +# elif [[ "${arg}" == "release.queue.full" ]] ; then +# mvn_args="package '-P!private'" +# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2" + + elif [[ "${arg}" == "build-picard-private" ]] ; then + mvn_args="mvn install -f private/picard-maven/pom.xml" + + # TODO: clover support + # see ant and maven docs for clover: + # https://confluence.atlassian.com/display/CLOVER/1.+QuickStart+Guide + # https://confluence.atlassian.com/display/CLOVER/Clover-for-Maven+2+and+3+User%27s+Guide + # + #elif [[ "${arg}" == "clover.report" ]] ; then + # mvn_args=... + # + #elif [[ "${arg}" == "with.clover" ]] ; then + # mvn_args=... + + # TODO: This runs *all* commit tests, including the few on Queue. 
+ elif [[ "${arg}" == "gatkfull.binary.release.tests" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify" + mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}" + mvn_args="${mvn_args} -Dsting.packagetests.enabled=true" + mvn_args="${mvn_args} -Dsting.packagecommittests.skipped=false" + + # TODO: This runs only the pipeline tests (full, non-dry run), but not the commit tests for Queue. + elif [[ "${arg}" == "queuefull.binary.release.tests" ]] ; then + local_repo="sitetemprepo" + mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify" + mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}" + mvn_args="${mvn_args} -Dsting.packagetests.enabled=true" + mvn_args="${mvn_args} -Dsting.packagepipelinetests.skipped=false" + mvn_args="${mvn_args} -Dsting.pipelinetests.run=true" + + elif [[ "${arg}" == "committests" ]] ; then + mvn_args="verify -Dsting.committests.skipped=false" + + elif [[ "${arg}" == "test" ]] ; then + mvn_args="test -Dsting.unittests.skipped=false" + + elif [[ "${arg}" == "unittest" ]] ; then + mvn_args="test -Dsting.unittests.skipped=false" + + elif [[ "${arg}" == "integrationtest" ]] ; then + mvn_args="verify -Dsting.integrationtests.skipped=false" + + elif [[ "${arg}" == "largescaletest" ]] ; then + mvn_args="verify -Dsting.largescaletests.skipped=false" + + elif [[ "${arg}" == "knowledgebasetest" ]] ; then + mvn_args="verify -Dsting.knowledgebasetests.skipped=false" + + elif [[ "${arg}" == "pipelinetest" ]] ; then + mvn_args="verify -Dsting.pipelinetests.skipped=false" + + elif [[ "${arg}" == "pipelinetestrun" ]] ; then + mvn_args="verify -Dsting.pipelinetests.skipped=false -Dsting.pipelinetests.run=true" + + elif [[ "${arg}" == "fasttest" ]] ; then + mvn_args="verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true" + + else + unknown_args="${unknown_args} \"${arg}\"" + + fi + + fi + +done + +mvn_cmd= +if [[ "${mvn_clean}" != "" ]] ; then + 
if [[ "${mvn_args}" != "" ]] ; then + mvn_cmd="mvn ${mvn_clean} && mvn ${mvn_args}" + else + mvn_cmd="mvn ${mvn_clean}" + fi +else + mvn_cmd="mvn ${mvn_args}" +fi + +if [[ "${unknown_args}" != "" ]] ; then + echo "Unrecognized arguments:${unknown_args}" >&2 + +else + echo "Equivalent maven command" + echo "${mvn_cmd}${mvn_properties}${post_script}" + + if [[ "${run_type}" != "dry" ]] ; then + sh -c "${mvn_cmd}${mvn_properties}${post_script}" + fi + +fi diff --git a/build.xml b/build.xml deleted file mode 100644 index 2493553fc..000000000 --- a/build.xml +++ /dev/null @@ -1,1533 +0,0 @@ - - - - - Compile and distribute the Sting toolkit - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Generating Queue GATK extensions... - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Building Scala... 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/ivy.xml b/ivy.xml deleted file mode 100644 index 
2e45247ab..000000000 --- a/ivy.xml +++ /dev/null @@ -1,117 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pom.xml b/pom.xml new file mode 100644 index 000000000..3abf73151 --- /dev/null +++ b/pom.xml @@ -0,0 +1,862 @@ + + + 4.0.0 + + + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT + public/sting-root + + + sting-aggregator + pom + Sting Aggregator + + + public + + + + + ${project.basedir} + StingText.properties + false + + -build-timestamp "${maven.build.timestamp}" + + + package + generate-resources + process-resources + process-test-resources + + + true + ${sting.packagecommittests.skipped} + ${sting.packagecommittests.skipped} + ${sting.packagecommittests.skipped} + true + true + + + true + ${sting.serialcommittests.skipped} + ${sting.serialcommittests.skipped} + ${sting.serialcommittests.skipped} + true + true + + + + + com.sun + tools + + + + + + + + + org.apache.maven.plugins + maven-clean-plugin + + + + + gatkdocs + + + ${basedir} + + javadoc.sh + options + packages + + + + + ${basedir} + + dependency-reduced-pom.xml + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + + unpack-dependencies + + none + + true + ${project.build.outputDirectory} + jar + system + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + default-resources + + resources + + ${sting.process-resources.phase} + + + default-testResources + + testResources + + ${sting.process-test-resources.phase} + + + copy-resource-bundle-log4j + + copy-resources + + none + + ${project.reporting.outputDirectory}/apidocs + + + ${sting.basedir}/sting-utils/src/main/config/org/broadinstitute/sting/utils/help + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + + javadoc + + none + + + ${resource.bundle.skip} + 
org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet + + ${project.build.outputDirectory} + + ${project.groupId} + + gatk-framework + ${project.version} + + 2g + false + true + -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} -out ${project.build.outputDirectory}/${resource.bundle.path} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + none + + com.google.java.contract.core.apt.AnnotationProcessor + + + + + default-compile + none + + + default-testCompile + none + + + + compile-package-info + + compile + + compile + + + -Xpkginfo:always + + + **/package-info.java + + + + + + compile-java + + compile + + compile + + + + **/package-info.java + + + + + + testCompile-java + + testCompile + + test-compile + + + + + + org.scala-tools + maven-scala-plugin + + + + compile + testCompile + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + default-jar + ${sting.jar.phase} + + + test-jar + + test-jar + + ${sting.jar.phase} + + true + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + + shade + + none + + true + + + org.broadinstitute.sting:gsalib:tar.gz:* + org.broadinstitute.sting:*:tar.bz2:example-resources + + + + + + ${app.main.class} + + + + ${resource.bundle.path} + + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + + single + + none + + + src/main/assembly/example-resources.xml + + + + + binary-dist + + single + + none + + + src/main/assembly/binary-dist.xml + + + + + + + + + com.pyx4j + maven-junction-plugin + + + link-public-testdata + + link + + none + + + + ${basedir}/public/testdata + ${sting.basedir}/public/gatk-framework/src/test/resources + + + + + + unlink-public-testdata + + unlink + + none + + + + ${basedir}/public/testdata + ${sting.basedir}/public/gatk-framework/src/test/resources + + + + + + link-private-testdata + + link + + none + + + + ${basedir}/private/testdata + 
${sting.basedir}/private/gatk-private/src/test/resources + + + + + + unlink-private-testdata + + unlink + + none + + + + ${basedir}/private/testdata + ${sting.basedir}/private/gatk-private/src/test/resources + + + + + + link-public-qscript + + link + + none + + + + ${basedir}/public/scala/qscript + ${sting.basedir}/public/queue-framework/src/main/qscripts + + + + + + unlink-public-qscript + + unlink + + none + + + + ${basedir}/public/scala/qscript + ${sting.basedir}/public/queue-framework/src/main/qscripts + + + + + + link-private-qscript + + link + + none + + + + ${basedir}/private/scala/qscript + ${sting.basedir}/private/queue-private/src/main/qscripts + + + + + + unlink-private-qscript + + unlink + + none + + + + ${basedir}/private/scala/qscript + ${sting.basedir}/private/queue-private/src/main/qscripts + + + + + + link-binary-jar + + link + + none + + + + ${sting.basedir}/target/${sting.binary-dist.name}.${project.packaging} + ${project.build.directory}/${project.build.finalName}.${project.packaging} + + + + + + link-git-release + + link + + none + + + + ${project.build.directory}/${sting.binary-dist.name}-${build.version}.tar.bz2 + ${project.build.directory}/${project.build.finalName}-binary-dist.tar.bz2 + + + + + + + + org.apache.maven.plugins + maven-invoker-plugin + + true + false + ${sting.basedir}/public/package-tests/pom.xml + true + true + ${sting.basedir}/${maven.repo.local} + + ${test} + ${it.test} + false + false + ${sting.packagetests.artifactId} + ${project.build.testOutputDirectory} + ${project.basedir} + ${sting.pipelinetests.run} + ${maven.surefire.debug} + ${maven.failsafe.debug} + + + + + + package-unittests + + run + + + + test + + ${project.build.directory}/invoker-reports/unit/${test} + ${sting.packageunittests.skipped} + + true + ${sting.packageunittests.skipped} + + + + + package-integrationtests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/integration/${it.test} + 
${sting.packageintegrationtests.skipped} + + true + ${sting.packageintegrationtests.skipped} + ${project.build.directory}/failsafe-reports/integration/failsafe-summary-${it.test}.xml + + + + + package-pipelinetests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/pipeline/${it.test} + ${sting.packagepipelinetests.skipped} + + true + ${sting.packagepipelinetests.skipped} + ${project.build.directory}/failsafe-reports/pipeline/failsafe-summary-${it.test}.xml + + + + + package-largescaletests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/largescale/${it.test} + ${sting.packagelargescaletests.skipped} + + true + ${sting.packagelargescaletests.skipped} + ${project.build.directory}/failsafe-reports/largescale/failsafe-summary-${it.test}.xml + + + + + package-knowledgebasetests + + integration-test + verify + + + + verify + + ${project.build.directory}/invoker-reports/knowledgebase/${it.test} + ${sting.packageknowledgebasetests.skipped} + + true + ${sting.packageknowledgebasetests.skipped} + ${project.build.directory}/failsafe-reports/knowledgebase/failsafe-summary-${it.test}.xml + + + + + + + org.apache.maven.plugins + maven-install-plugin + 2.5 + + + install-package + + install-file + + none + + true + ${project.groupId} + ${project.artifactId} + ${project.version} + ${project.packaging} + ${project.build.directory}/${project.build.finalName}.${project.packaging} + + + + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + com.pyx4j + maven-junction-plugin + + + link-public-testdata + process-test-resources + + + unlink-public-testdata + clean + + + link-public-qscript + process-test-resources + + + unlink-public-qscript + clean + + + + + org.apache.maven.plugins + maven-clean-plugin + + + com.google.code.sortpom + maven-sortpom-plugin + + + package-tests + + sort + + verify + false + + public/package-tests/pom.xml + + + + + + + + + + + org.apache.maven.plugins + 
maven-javadoc-plugin + 2.9.1 + + + + + + + generate-gatk-docs + + aggregate + + + false + + org.broadinstitute.sting.utils.help.GATKDoclet + + ${project.groupId} + gatk-package + ${project.version} + + false + true + private + -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} ${gatkdocs.include.hidden} -settings-dir ${sting.basedir}/settings/helpTemplates -destination-dir ${project.build.directory}/gatkdocs + + + + + + + + + + + protected + + + ${basedir}/protected/pom.xml + + + + protected + + + + + + private + + + ${basedir}/private/pom.xml + + + + private + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + true + true + none + none + none + none + + + + + diff --git a/protected/gatk-protected/pom.xml b/protected/gatk-protected/pom.xml new file mode 100644 index 000000000..d75c5b056 --- /dev/null +++ b/protected/gatk-protected/pom.xml @@ -0,0 +1,139 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + gatk-protected + jar + GATK Protected + + + ${project.basedir}/../.. 
+ gatk-package + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + net.sf.jgrapht + jgrapht + + + + gov.nist.math + jama + + + + it.unimi.dsi + fastutil + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + + + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java new file mode 100644 index 000000000..a04815e62 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -0,0 +1,517 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. 
LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import cern.jet.math.Arithmetic; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + + +/** + * Phred-scaled p-value using Fisher's Exact Test to detect strand bias + * + *

Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation + * being seen on only the forward or only the reverse strand) in the reads. More bias is + * indicative of false positive calls. + *

+ * + *

Caveat

+ *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

+ */ +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static boolean ENABLE_DEBUGGING = false; + private final static Logger logger = Logger.getLogger(FisherStrand.class); + + private static final String FS = "FS"; + private static final double MIN_PVALUE = 1E-320; + private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; + private static final int MIN_COUNT = 2; + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if ( !vc.isVariant() ) + return null; + + if ( vc.hasGenotypes() ) { + final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); + if ( tableFromPerSampleAnnotations != null ) { + return pValueForBestTable(tableFromPerSampleAnnotations, null); + } + } + + if (vc.isSNP() && stratifiedContexts != null) { + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); + final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + printTable("unfiltered", tableNoFiltering); + printTable("filtered", tableFiltering); + return pValueForBestTable(tableFiltering, tableNoFiltering); + } + else if (stratifiedPerReadAlleleLikelihoodMap != null) { + // either SNP with no alignment context, or indels: per-read likelihood map needed + final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); +// logger.info("VC " + vc); +// printTable(table, 0.0); + return pValueForBestTable(table, null); + } + else + // for non-snp variants, we need per-read likelihoods. 
+ // for snps, we can get same result from simple pileup + return null; + } + + /** + * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together + * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation + * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation + */ + private int[][] getTableFromSamples( final GenotypesContext genotypes ) { + if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + + final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse + boolean foundData = false; + + for( final Genotype g : genotypes ) { + if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + continue; + + foundData = true; + final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final int[] data = encodeSBBS(sbbsString); + if ( passesMinimumThreshold(data) ) { + for( int index = 0; index < sbArray.length; index++ ) { + sbArray[index] += data[index]; + } + } + } + + return ( foundData ? decodeSBBS(sbArray) : null ); + } + + /** + * Does this strand data array pass the minimum threshold for inclusion? 
+ * + * @param data the array + * @return true if it passes the minimum threshold, false otherwise + */ + private static boolean passesMinimumThreshold(final int[] data) { + // the ref and alt totals must each be greater than MIN_COUNT + return data[0] + data[1] > MIN_COUNT && data[2] + data[3] > MIN_COUNT; + } + + /** + * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 + * + * @param table1 a contingency table, may be null + * @param table2 a contingency table, may be null + * @return annotation result for FS given tables + */ + private Map pValueForBestTable(final int[][] table1, final int[][] table2) { + if ( table2 == null ) + return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1)); + else if (table1 == null) + return annotationForOneTable(pValueForContingencyTable(table2)); + else { // take the one with the best (i.e., least significant pvalue) + double pvalue1 = pValueForContingencyTable(table1); + double pvalue2 = pValueForContingencyTable(table2); + return annotationForOneTable(Math.max(pvalue1, pvalue2)); + } + } + + /** + * Returns an annotation result given a pValue + * + * @param pValue + * @return a hash map from FS -> phred-scaled pValue + */ + private Map annotationForOneTable(final double pValue) { + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs + return Collections.singletonMap(FS, value); + } + + public List getKeyNames() { + return Collections.singletonList(FS); + } + + public List getDescriptions() { + return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + } + + /** + * Helper function to turn the FisherStrand table into the SB annotation array + * @param table the table used by the FisherStrand annotation + * @return the array used by the per-sample Strand Bias annotation + */ + 
public static List getContingencyArray( final int[][] table ) { + if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + final List list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + list.add(table[0][0]); + list.add(table[0][1]); + list.add(table[1][0]); + list.add(table[1][1]); + return list; + } + + /** + * Helper function to parse the genotype annotation into the SB annotation array + * @param string the string that is returned by genotype.getAnnotation("SB") + * @return the array used by the per-sample Strand Bias annotation + */ + private static int[] encodeSBBS( final String string ) { + final int[] array = new int[4]; + final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); + for( int index = 0; index < 4; index++ ) { + array[index] = Integer.parseInt(tokenizer.nextToken()); + } + return array; + } + + /** + * Helper function to turn the SB annotation array into the FisherStrand table + * @param array the array used by the per-sample Strand Bias annotation + * @return the table used by the FisherStrand annotation + */ + private static int[][] decodeSBBS( final int[] array ) { + if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } + final int[][] table = new int[2][2]; + table[0][0] = array[0]; + table[0][1] = array[1]; + table[1][0] = array[2]; + table[1][1] = array[3]; + return table; + } + + private Double pValueForContingencyTable(int[][] originalTable) { + final int[][] normalizedTable = normalizeContingencyTable(originalTable); + + int[][] table = copyContingencyTable(normalizedTable); + + double pCutoff = computePValue(table); + //printTable(table, pCutoff); + + double pValue = pCutoff; + while (rotateTable(table)) { + double pValuePiece = 
computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + table = copyContingencyTable(normalizedTable); + while (unrotateTable(table)) { + double pValuePiece = computePValue(table); + + //printTable(table, pValuePiece); + + if (pValuePiece <= pCutoff) { + pValue += pValuePiece; + } + } + + //System.out.printf("P-cutoff: %f\n", pCutoff); + //System.out.printf("P-value: %f\n\n", pValue); + + // min is necessary as numerical precision can result in pValue being slightly greater than 1.0 + return Math.min(pValue, 1.0); + } + + // how large do we want the normalized table to be? + private static final double TARGET_TABLE_SIZE = 200.0; + + /** + * Normalize the table so that the entries are not too large. + * Note that this method does NOT necessarily make a copy of the table being passed in! + * + * @param table the original table + * @return a normalized version of the table or the original table if it is already normalized + */ + private static int[][] normalizeContingencyTable(final int[][] table) { + final int sum = table[0][0] + table[0][1] + table[1][0] + table[1][1]; + if ( sum <= TARGET_TABLE_SIZE * 2 ) + return table; + + final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE; + + final int[][] normalized = new int[2][2]; + for ( int i = 0; i < 2; i++ ) { + for ( int j = 0; j < 2; j++ ) + normalized[i][j] = (int)(table[i][j] / normalizationFactor); + } + + return normalized; + } + + private static int [][] copyContingencyTable(int [][] t) { + int[][] c = new int[2][2]; + + for ( int i = 0; i < 2; i++ ) + for ( int j = 0; j < 2; j++ ) + c[i][j] = t[i][j]; + + return c; + } + + + private static void printTable(int[][] table, double pValue) { + logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + + /** + * Printing information to logger.info for debugging purposes + * + * @param name the name of the table + * @param 
table the table itself + */ + private void printTable(final String name, final int[][] table) { + if ( ENABLE_DEBUGGING ) { + final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); + logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", + name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + } + + private static boolean rotateTable(int[][] table) { + table[0][0] -= 1; + table[1][0] += 1; + + table[0][1] += 1; + table[1][1] -= 1; + + return (table[0][0] >= 0 && table[1][1] >= 0); + } + + private static boolean unrotateTable(int[][] table) { + table[0][0] += 1; + table[1][0] -= 1; + + table[0][1] -= 1; + table[1][1] += 1; + + return (table[0][1] >= 0 && table[1][0] >= 0); + } + + private static double computePValue(int[][] table) { + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + private static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + private static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } + + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + + final Allele ref = vc.getReference(); + final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); + final int[][] table = new int[2][2]; + + for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + final int[] myTable = new int[4]; + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final GATKSAMRecord read = el.getKey(); + final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; + updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); + } + if ( passesMinimumThreshold(myTable) ) + copyToMainTable(myTable, table); + } + + return table; + } + + /** + * Helper method to copy the per-sample table to the main table + * + * @param perSampleTable per-sample table (single dimension) + * @param mainTable main table (two dimensions) + */ + private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) { + mainTable[0][0] += perSampleTable[0]; + mainTable[0][1] += perSampleTable[1]; + mainTable[1][0] += perSampleTable[2]; + mainTable[1][1] += perSampleTable[3]; + } + + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getSNPContingencyTable(final Map stratifiedContexts, + final Allele ref, + final Allele alt, + final int minQScoreToConsider ) { + int[][] table = new int[2][2]; + + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + final int[] myTable = new int[4]; + for (PileupElement p : sample.getValue().getBasePileup()) { + + if ( ! isUsableBase(p) ) // ignore deletions and bad MQ + continue; + + if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) + continue; + + updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); + } + if ( passesMinimumThreshold(myTable) ) + copyToMainTable(myTable, table); + } + + return table; + } + + /** + * Can the base in this pileup element be used in comparative tests? + * + * @param p the pileup element to consider + * + * @return true if this base is part of a meaningful read for comparison, false otherwise + */ + private static boolean isUsableBase(final PileupElement p) { + return !( p.isDeletion() || + p.getMappingQual() == 0 || + p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || + ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); + } + + private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { + + final boolean matchesRef = allele.equals(ref, true); + final boolean matchesAlt = allele.equals(alt, true); + + if ( matchesRef || matchesAlt ) { + final int offset = matchesRef ? 0 : 2; + + if ( read.isStrandless() ) { + + // ignore strandless reduced reads because they are always on the forward strand! 
+ if ( !read.isReducedRead() ) { + + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + final int toAdd = Math.max(representativeCount / 2, 1); + table[offset] += toAdd; + table[offset + 1] += toAdd; + } + } else { + // a normal read with an actual strand + final boolean isFW = !read.getReadNegativeStrandFlag(); + table[offset + (isFW ? 0 : 1)] += representativeCount; + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java new file mode 100644 index 000000000..7ebbd49dd --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -0,0 +1,191 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Variant confidence (from the QUAL field) / unfiltered depth of non-reference 
samples. Note that the QD is also normalized by event length. + * + * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing + * reads associated with the samples with polymorphic genotypes. + */ +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +// private final static Logger logger = Logger.getLogger(QualByDepth.class); + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int standardDepth = 0; + int ADrestrictedDepth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1! + // TODO -- If we like how this is working and want to apply it to a situation other than the single sample HC pipeline, + // TODO -- then we will need to modify the annotateContext() - and related - routines in the VariantAnnotatorEngine + // TODO -- so that genotype-level annotations are run first (to generate AD on the samples) and then the site-level + // TODO -- annotations must come afterwards (so that QD can use the AD). 
+ if ( genotype.hasAD() ) { + final int[] AD = genotype.getAD(); + final int totalADdepth = (int)MathUtils.sum(AD); + if ( totalADdepth - AD[0] > 1 ) + ADrestrictedDepth += totalADdepth; + standardDepth += totalADdepth; + continue; + } + + if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + standardDepth += context.getBasePileup().depthOfCoverage(); + + } else if (perReadAlleleLikelihoodMap != null) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) + continue; + + standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } else if ( genotype.hasDP() ) { + standardDepth += genotype.getDP(); + } + } + + // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward + if ( ADrestrictedDepth > 0 ) + standardDepth = ADrestrictedDepth; + + if ( standardDepth == 0 ) + return null; + + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); + // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a + // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper) + double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null)); + + // Hack: see note in the fixTooHighQD method below + QD = fixTooHighQD(QD); + + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + + /** + * Generate the indel normalization factor. 
+ * + * @param altAlleleLength the average alternate allele length for the call + * @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length? + * @return a possitive double + */ + private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) { + return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0); + } + + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * // TODO -- remove me when HaplotypeCaller bubble caller is live + * + * @param QD the raw QD score + * @return a QD value + */ + private double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + private final static double MAX_QD_BEFORE_FIXING = 35; + private final static double IDEAL_HIGH_QD = 30; + private final static double JITTER_SIGMA = 3; + + public List getKeyNames() { return Arrays.asList("QD"); } + + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java new file mode 100644 index 000000000..ec1c1e729 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; + +import java.util.*; + +/** + * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias + * User: rpoplin + * Date: 8/28/13 + */ + +public class StrandBiasBySample extends GenotypeAnnotation { + + public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + + @Override + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( ! 
isAppropriateInput(alleleLikelihoodMap, g) ) + return; + + final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); + + gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + + @Override + public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + + private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) { + return ! (map == null || g == null || !g.isCalled()); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java 
similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java new file mode 100644 index 000000000..aa334f680 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -0,0 +1,844 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.*; + +import java.io.PrintStream; +import java.lang.reflect.Constructor; +import java.util.*; + +public class UnifiedGenotyperEngine { + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + private static final String GPSTRING = "GENERALPLOIDY"; + + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; + + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; + public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; + + private static final int SNP_MODEL = 0; + private static final int INDEL_MODEL = 1; + + public enum OUTPUT_MODE { + /** produces calls only at variant sites */ + EMIT_VARIANTS_ONLY, + /** produces calls at variant sites and confident reference sites */ + EMIT_ALL_CONFIDENT_SITES, + /** produces calls at any callable site regardless of confidence; this argument is intended only for point + * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by + * no means produce a comprehensive set of indels in DISCOVERY mode */ + EMIT_ALL_SITES + } + + // the unified argument collection + private final UnifiedArgumentCollection UAC; + public UnifiedArgumentCollection getUAC() { return UAC; } + + // the annotation engine + private final VariantAnnotatorEngine annotationEngine; + + // the model used for calculating genotypes + private ThreadLocal> glcm = new ThreadLocal>(); + private final List modelsToUse = new ArrayList(2); + + // the model used for calculating p(non-ref) + private ThreadLocal afcm = new ThreadLocal(); + + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] 
log10AlleleFrequencyPriorsIndels; + + // samples in input + private final Set samples; + + // the various loggers and writers + private final Logger logger; + private final PrintStream verboseWriter; + + // number of chromosomes (ploidy * samples) in input + private final int ploidy; + private final int N; + + // the standard filter to use for calls below the confidence threshold but above the emit threshold + private static final Set filter = new HashSet(1); + + private final GenomeLocParser genomeLocParser; + private final boolean BAQEnabledOnCMDLine; + + // --------------------------------------------------------------------------------------------------------- + // + // Public interface functions + // + // --------------------------------------------------------------------------------------------------------- + @Requires({"toolkit != null", "UAC != null"}) + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); + } + + protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set samples, UnifiedArgumentCollection UAC) { + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + } + + @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { + this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; + genomeLocParser = toolkit.getGenomeLocParser(); + this.samples = new TreeSet(samples); + // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ + 
this.UAC = UAC; + + this.logger = logger; + this.verboseWriter = verboseWriter; + this.annotationEngine = engine; + + this.ploidy = ploidy; + this.N = samples.size() * ploidy; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); + + filter.add(LOW_QUAL_FILTER_NAME); + + determineGLModelsToUse(); + + // do argument checking + if (UAC.annotateAllSitesWithPLs) { + if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) + throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); + + } + } + + /** + * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) + * + * same as the full call but with allSamples == null + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); + } + + + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. 
+ * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ + public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Set allSamples) { + final List results = new ArrayList(2); + + final List models = getGLModelsToUse(tracker, refContext, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap(); + + if ( models.isEmpty() ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); + } + else { + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + perReadAlleleLikelihoodMap.clear(); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + if ( stratifiedContexts == null ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); + } + else { + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + if ( vc != null ) + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + + } + } + } + + return results; + } + + /** + * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) + * @return the VariantContext object + */ + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { + return null; + } + + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + // return the first valid one we encounter + if ( stratifiedContexts != null ) + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); + + } + + return null; + } + + /** + * Compute genotypes at a given locus. 
Entry point for engine calls from UGCallVariants. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { + return null; + } + + // return the first one + final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); + } + + /** + * Compute genotypes at a given locus. + * + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(VariantContext vc) { + return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Private implementation helpers + // + // --------------------------------------------------------------------------------------------------------- + + // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + + // 
initialize the data for this thread if that hasn't been done yet + if ( glcm.get() == null ) { + glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); + } + + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + } + + private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { + VariantContext vc; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); + if ( vcInput == null ) + return null; + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make(); + } else { + // deal with bad/non-standard reference bases + if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) + return null; + + Set alleles = new HashSet(); + alleles.add(Allele.create(ref.getBase(), true)); + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); + } + + if ( annotationEngine != null ) { + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); + } + + return new VariantCallContext(vc, false); + } + + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(null, null, null, null, vc, model, 
perReadAlleleLikelihoodMap); + } + + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); + } + + /** + * Main entry function to calculate genotypes of a given VC with corresponding GL's + * @param tracker Tracker + * @param refContext Reference context + * @param rawContext Raw context + * @param stratifiedContexts Stratified alignment contexts + * @param vc Input VC + * @param model GL calculation model + * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc + * @return VC with assigned genotypes + */ + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { + + boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + + // TODO TODO TODO TODO + // REFACTOR THIS FUNCTION, TOO UNWIELDY!! 
+ + // initialize the data for this thread if that hasn't been done yet + if ( afcm.get() == null ) { + afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); + } + + // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call + if (!canVCbeGenotyped(vc)) { + if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) + return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); + else + return null; + + } + + // estimate our confidence in a reference call and return + if ( vc.getNSamples() == 0 ) { + if ( limitedContext ) + return null; + return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? + estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : + generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); + } + + final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); + + // is the most likely frequency conformation AC=0 for all alternate alleles? 
+ boolean bestGuessIsRef = true; + + // determine which alternate alleles have AF>0 + final List myAlleles = new ArrayList<>(vc.getAlleles().size()); + final List alleleCountsofMLE = new ArrayList<>(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); + for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { + final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); + if ( alternateAllele.isReference() ) + continue; + + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + final boolean toInclude = isNonRef || alternateAllele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE || + UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.annotateAllSitesWithPLs; + + bestGuessIsRef &= !isNonRef; + + if (toInclude) { + myAlleles.add(alternateAllele); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); + } + } + + final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); + + // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice + final double phredScaledConfidence = + Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs + ? 
-10 * AFresult.getLog10PosteriorOfAFEq0() + : -10 * AFresult.getLog10PosteriorOfAFGT0()); + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); + } + + // start constructing the resulting VC + final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filters(filter); + + // create the genotypes + final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); + builder.genotypes(genotypes); + + // print out stats if we have a writer + if ( verboseWriter != null && !limitedContext ) + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + final HashMap attributes = new HashMap(); + + // inherit attributed from input vc if requested + if (inheritAttributesFromInputVC) + attributes.putAll(vc.getAttributes()); + // if the site was downsampled, record that fact + if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) + attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + + // add the MLE AC and AF annotations + if ( alleleCountsofMLE.size() 
> 0 ) { + attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); + final int AN = builder.make().getCalledChrCount(); + final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) + for ( int AC : alleleCountsofMLE ) + MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); + attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); + } + + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { + //final boolean DEBUG_SLOD = false; + + // the overall lod + //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; + final double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + + final List allAllelesToUse = builder.make().getAlleles(); + + // the forward lod + final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0(); + final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + + // the reverse lod + final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult reverseAFresult = 
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0(); + final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0(); + //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + + final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; + final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; + //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + + // strand score is max bias between forward and reverse strands + double strandScore = Math.max(forwardLod, reverseLod); + // rescale by a factor of 10 + strandScore *= 10.0; + //logger.debug(String.format("SLOD=%f", strandScore)); + + if ( !Double.isNaN(strandScore) ) + attributes.put("SB", strandScore); + } + + // finish constructing the resulting VC + builder.attributes(attributes); + VariantContext vcCall = builder.make(); + + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations + final ReadBackedPileup pileup = rawContext.getBasePileup(); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); + } + + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
+ if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync + vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); + + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); + } + + /** + * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. + * @param vc Input VC + * @return Status check + */ + @Requires("vc != null") + protected boolean canVCbeGenotyped(final VariantContext vc) { + // protect against too many alternate alleles that we can't even run AF on: + if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { + logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + + " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); + return false; + } + else return true; + + } + + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { + + if ( !BaseUtils.isRegularBase(refContext.getBase()) ) + return null; + + Map stratifiedContexts = null; + + if ( model.name().contains("INDEL") ) { + + final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); + // don't call when there is no coverage + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + + } else if ( model.name().contains("SNP") ) { + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); + + if ( !(UAC.OutputMode == 
OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { + int numDeletions = 0; + for ( final PileupElement p : rawContext.getBasePileup() ) { + if ( p.isDeletion() ) + numDeletions += p.getRepresentativeCount(); + } + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { + return null; + } + } + } + + return stratifiedContexts; + } + + private final double getRefBinomialProbLog10(final int depth) { + return MathUtils.log10BinomialProbability(depth, 0); + } + + private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + if ( contexts == null ) + return null; + + double log10POfRef = Math.log10(initialPofRef); + + // for each sample that we haven't examined yet + for ( String sample : samples ) { + final AlignmentContext context = contexts.get(sample); + if ( ignoreCoveredSamples && context != null ) + continue; + final int depth = context == null ? 
0 : context.getBasePileup().depthOfCoverage(); + log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); + } + + return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); + } + + /** + * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference + * + * Assumes the sample is diploid + * + * @param depth the depth of the sample + * @param theta the heterozygosity of this species (between 0 and 1) + * @return a valid log10 probability of the sample being hom-ref + */ + @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) + @Ensures("MathUtils.goodLog10Probability(result)") + protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { + final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); + return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); + } + + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { + Allele refAllele = null, altAllele = null; + for ( Allele allele : vc.getAlleles() ) { + if ( allele.isReference() ) + refAllele = allele; + else + altAllele = allele; + } + + for (int i = 0; i <= N; i++) { + StringBuilder AFline = new StringBuilder("AFINFO\t"); + AFline.append(pos); + AFline.append("\t"); + AFline.append(refAllele); + AFline.append("\t"); + if ( altAllele != null ) + AFline.append(altAllele); + else + AFline.append("N/A"); + AFline.append("\t"); + AFline.append(i + "/" + N + "\t"); + AFline.append(String.format("%.2f\t", ((float)i)/N)); + AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); + verboseWriter.println(AFline.toString()); + } + + verboseWriter.println("P(f>0) = " + PofF); + verboseWriter.println("Qscore = " + phredScaledConfidence); + verboseWriter.println(); + 
} + + protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + } + + protected boolean passesCallThreshold(double conf) { + return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; + } + + protected boolean confidentlyCalled(double conf, double PofF) { + return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING || + (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); + } + + private void determineGLModelsToUse() { + String modelPrefix = ""; + if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) + modelPrefix = GPSTRING; + + // GGA mode => must initialize both the SNP and indel models + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); + } + else { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); + } + } + + // decide whether we are currently processing SNPs, indels, neither, or both + private List getGLModelsToUse(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) + return modelsToUse; + + if ( modelsToUse.size() != 2 ) + throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); + + // if we're genotyping given 
alleles then we need to choose the model corresponding to the variant type requested + final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); + + if ( vcInput == null ) { + return Collections.emptyList(); // no work to be done + } else if ( vcInput.isSNP() ) { + return Collections.singletonList(modelsToUse.get(SNP_MODEL)); + } else if ( vcInput.isIndel() || vcInput.isMixed() ) { + return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); + } else { + return Collections.emptyList(); // No support for other types yet + } + } + + /** + * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param priors (output) array to be filled with priors + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) + */ + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + + + double sum = 0.0; + + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } + } + + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics 
assumptions) + if (sum > 1.0) { + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); + } + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); + } + + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + if (model.name().toUpperCase().contains("SNP")) + return log10AlleleFrequencyPriorsSNPs; + else if (model.name().toUpperCase().contains("INDEL")) + return log10AlleleFrequencyPriorsIndels; + else + throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + + } + + protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { + if( model.name().contains("SNP") ) + return HUMAN_SNP_HETEROZYGOSITY; + if( model.name().contains("INDEL") ) + return HUMAN_INDEL_HETEROZYGOSITY; + else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } + + private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + + final Map glcm = new HashMap(); + final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); + + for (int i = 0; i < glmClasses.size(); i++) { + final Class glmClass = glmClasses.get(i); + final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); + try { + final Object args[] = new Object[]{UAC,logger}; + final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); + glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); + } + catch (Exception e) { + throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + 
e.getMessage()); + } + } + + return glcm; + } + + public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { + if ( tracker == null || ref == null || logger == null ) + return null; + VariantContext vc = null; + + // search for usable record + for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { + if ( vc == null ) { + vc = vc_input; + } else { + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); + } + } + } + + return vc; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java new file mode 100644 index 000000000..b778195a9 --- /dev/null +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -0,0 +1,333 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +public abstract class DiploidExactAFCalc extends ExactAFCalc { + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); + } + + @Override + protected AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final int numAlternateAlleles = vc.getNAlleles() - 1; + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), 
true); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList<>(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap<>(numChr+1); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + while ( !ACqueue.isEmpty() ) { + getStateTracker().incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + + calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); + + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); + } + + + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); + } + + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { + final ArrayList GLs = getGLs(vc.getGenotypes(), true); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + final int alleleLikelihoodIndex1 = alleles.alleleIndex1 - 1; + final int alleleLikelihoodIndex2 = alleles.alleleIndex2 - 1; + if ( alleles.alleleIndex1 != 0 ) + 
likelihoodSums[alleleLikelihoodIndex1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleleLikelihoodIndex2].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); + + final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // can we abort early because the log10Likelihoods are so small? 
+ if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.getACcounts().getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList<>(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList<>(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are 
different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private void updateACset(final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors) { + + set.getLog10Likelihoods()[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + 
return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; + set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; + } + + double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // update the MLE if necessary + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + + // apply the priors over each alternate allele + for ( final int ACcount : set.getACcounts().getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + } + + private void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; + targetSet.getLog10Likelihoods()[j] = 
MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); + } + } + } + + private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return allelesToUse.size() == 1 + ? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy) + : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, + assignGenotypes ? 
GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java new file mode 100644 index 000000000..7b48b3d4d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -0,0 +1,240 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +/** + * Uses the Exact calculation of Heng Li + */ +abstract class ExactAFCalc extends AFCalc { + protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first. + */ + protected static final Comparator LIKELIHOOD_SUM_COMPARATOR = new Comparator() { + + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return - Double.compare(o1.sum,o2.sum); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first but make sure that + * NON_REF alleles are place are last. 
+ */ + protected static final Comparator LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + if (o1.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return 1; + else if (o2.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return -1; + else + return o1.compareTo(o2); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with lower alternative allele index are first regardless of + * the likelihood sum. + */ + protected static final Comparator LIKELIHOOD_INDEX_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return Integer.compare(o1.index, o2.index); + } + }; + + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + } + + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public final Allele allele; + public final int index; + + public LikelihoodSum(final Allele allele, final int index) { this.allele = allele; this.index = index; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; + } + } + + /** + * Unpack GenotypesContext into arraylist of doubel values + * @param GLs Input genotype context + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { + final ArrayList genotypeLikelihoods = new ArrayList<>(GLs.size() + 1); + + if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { + if ( sample.hasLikelihoods() ) { + final double[] gls = sample.getLikelihoods().getAsVector(); + + if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + genotypeLikelihoods.add(gls); + } + } + + return genotypeLikelihoods; + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + // don't try to genotype too many alternate alleles + final List inputAltAlleles = vc.getAlternateAlleles(); + final List outputAltAlleles = reduceScopeAlleles(vc,getMaxAltAlleles()); + + // only if output allele has reduced from the input alt allele set size we should care. 
+ final int altAlleleReduction = inputAltAlleles.size() - outputAltAlleles.size(); + + if (altAlleleReduction == 0) + return vc; + else if (altAlleleReduction != 0) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + + " has " + (vc.getAlternateAlleles().size()) + + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + final List alleles = new ArrayList<>(getMaxAltAlleles() + 1); + alleles.add(vc.getReference()); + alleles.addAll(reduceScopeAlleles(vc, getMaxAltAlleles())); + final VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(reduceScopeGenotypes(vc, alleles)); + if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc + " " + builder.make()); + return builder.make(); + } else // if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc); + } + + /** + * Returns a the new set of alleles to use. + * @param vc target variant context. + * @param numAllelesToChoose number of alleles to keep. + * @return the list of alternative allele to keep. + */ + protected List reduceScopeAlleles(final VariantContext vc, final int numAllelesToChoose) { + + // Look for the allele to exclude it from the pruning if present. + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + + final int nonRefAltAlleleIndex = GATKVariantContextUtils.indexOfAltAllele(vc, + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE, false); + final boolean nonRefAltAllelePresent = nonRefAltAlleleIndex >= 0; + + // should not be considered in the downsizing, so we need to count it out when + // considering if alt. allele downsizing is required. 
+ final int numProperOriginalAltAlleles = numOriginalAltAlleles - (nonRefAltAllelePresent ? 1 : 0); + + // Avoid pointless allele reduction: + if (numAllelesToChoose >= numProperOriginalAltAlleles) + return vc.getAlternateAlleles(); + + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + final Allele allele = vc.getAlternateAllele(i); + likelihoodSums[i] = new LikelihoodSum(allele,i); + } + + // Calculate the allele likelihood sums. + reduceScopeCalculateLikelihoodSums(vc, likelihoodSums); + + // sort them by probability mass and choose the best ones + // Make sure that the allele is last if present. + Collections.sort(Arrays.asList(likelihoodSums), nonRefAltAllelePresent ? LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR : LIKELIHOOD_SUM_COMPARATOR); + + // We need to return the best likelihood alleles in the original alternative allele index order. + // This heap will keep track of that index order. + final PriorityQueue mostLikelyAllelesHeapByIndex = new PriorityQueue<>(numOriginalAltAlleles, LIKELIHOOD_INDEX_COMPARATOR); + + for ( int i = 0; i < numAllelesToChoose; i++ ) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[i]); + + // guaranteed no to have been added at this point thanks for checking on whether reduction was + // needed in the first place. + if (nonRefAltAllelePresent) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[nonRefAltAlleleIndex]); + + final ArrayList orderedBestAlleles = new ArrayList<>(numAllelesToChoose); + + while (!mostLikelyAllelesHeapByIndex.isEmpty()) + orderedBestAlleles.add(mostLikelyAllelesHeapByIndex.remove().allele); + + return orderedBestAlleles; + } + + protected static final int PL_INDEX_OF_HOM_REF = 0; + + /** + * Update the likelihood sums with using the variant context genotype likelihoods. + * @param vc source variant context. + * @param likelihoodSums where to update the likelihood sums. 
+ */ + protected abstract void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums); + + /** + * Transforms the genotypes of the variant context according to the new subset of possible alleles. + * + * @param vc original variant-context. + * @param allelesToUse possible alleles. + * @return never {@code null}, the new set of genotype calls for the reduced scope. + */ + protected abstract GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse); +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java new file mode 100644 index 000000000..2978cb8f2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -0,0 +1,590 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +public class GeneralPloidyExactAFCalc extends ExactAFCalc { + static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them + + private final int ploidy; + + private final static boolean VERBOSE = false; + + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + this.ploidy = ploidy; + } + + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return subsetAlleles(vc,allelesToUse,false,ploidy); + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, 
log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); + } + + /** + * Simple wrapper class to hold values of combined pool likelihoods. + * For fast hashing and fast retrieval, there's a hash map that shadows main list. + * + */ + static class CombinedPoolLikelihoods { + private LinkedList alleleCountSetList; + private HashMap conformationMap; + private double maxLikelihood; + + + public CombinedPoolLikelihoods() { + // final int numElements = GenotypeLikelihoods.numLikelihoods(); + alleleCountSetList = new LinkedList<>(); + conformationMap = new HashMap<>(); + maxLikelihood = Double.NEGATIVE_INFINITY; + } + + public void add(ExactACset set) { + alleleCountSetList.add(set); + conformationMap.put(set.getACcounts(), set); + final double likelihood = set.getLog10Likelihoods()[0]; + + if (likelihood > maxLikelihood ) + maxLikelihood = likelihood; + + } + + public boolean hasConformation(int[] ac) { + return conformationMap.containsKey(new ExactACcounts(ac)); + + } + + public double getLikelihoodOfConformation(int[] ac) { + return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; + } + + public double getGLOfACZero() { + return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list + } + + public int getLength() { + return alleleCountSetList.size(); + } + } + + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { + final int numOriginalAltAlleles = likelihoodSums.length; + final ArrayList GLs = getGLs(vc.getGenotypes(), false); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); + // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele + for (int k=1; k < 
acCount.length;k++) + if (acCount[k] > 0 ) + likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); + } + } + + /** + * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. + * @param GLs Inputs genotypes context with per-pool GLs + * @param numAlleles Number of alternate alleles + * @param ploidyPerPool Number of samples per pool + * @param log10AlleleFrequencyPriors Frequency priors + */ + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { + + final ArrayList genotypeLikelihoods = getGLs(GLs, true); + + + int combinedPloidy = 0; + + // Combine each pool incrementally - likelihoods will be renormalized at each step + CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods(); + + // first element: zero ploidy, e.g. trivial degenerate distribution + final int[] zeroCounts = new int[numAlleles]; + final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); + set.getLog10Likelihoods()[0] = 0.0; + + combinedPoolLikelihoods.add(set); + + if ( genotypeLikelihoods.size() <= 1 ) { + // no meaningful GLs at all, just set the tracker to non poly values + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); + } else { + for (int p=1; p ACqueue = new LinkedList<>(); + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap<>(); + final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlleles]; + final int newPloidy = originalPloidy + newGLPloidy; + zeroCounts[0] = newPloidy; + + ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); + + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + // keep processing while we have 
AC conformations that need to be calculated + while ( !ACqueue.isEmpty() ) { + getStateTracker().incNEvaluations(); + // compute log10Likelihoods + final ExactACset ACset = ACqueue.remove(); + + calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); + + // clean up memory + indexesToACset.remove(ACset.getACcounts()); + if ( VERBOSE ) + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); + + } + return newPool; + } + + // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods + /** + * + * @param set ExactACset holding conformation to be computed + * @param newPool New pool likelihood holder + * @param originalPool Original likelihood holder + * @param newGL New pool GL vector to combine + * @param log10AlleleFrequencyPriors Prior object + * @param originalPloidy Total ploidy of original combined pool + * @param newGLPloidy Ploidy of GL vector + * @param ACqueue Queue of conformations to compute + * @param indexesToACset AC indices of objects in queue + * @return max log likelihood + */ + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { + + // compute likeihood in "set" of new set based on original likelihoods + final int numAlleles = set.getACcounts().getCounts().length; + final int newPloidy = set.getACsum(); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); + + + // add to new pool + if (!Double.isInfinite(log10LofK)) + newPool.add(set); + + // TODO -- change false to true this correct line when the 
implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { + return log10LofK; + } + + // iterate over higher frequencies if possible + // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. + // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space + final int ACwiggle = set.getACcounts().getCounts()[0]; + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + + // add conformations for other cases + for ( int allele = 1; allele < numAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // is this a valid conformation? + int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; + ACcountsClone[0] = newPloidy - altSum; + if (ACcountsClone[0] < 0) + continue; + + + GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); + } + + + return log10LofK; + } + + +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. 
+// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += 
MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } + + /** + * Compute likelihood of a particular AC conformation and update AFresult object + * @param set Set of AC counts to compute + * @param firstGLs Original pool likelihoods before combining + * @param secondGL New GL vector with additional pool + * @param log10AlleleFrequencyPriors Allele frequency priors + * @param numAlleles Number of alleles (including ref) + * @param ploidy1 Ploidy of original pool (combined) + * @param ploidy2 Ploidy of new pool + * @return log-likehood of requested conformation + */ + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { + + final int newPloidy = ploidy1 + ploidy2; + + // sanity check + int totalAltK = set.getACsum(); + if (newPloidy != totalAltK) + throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); + + totalAltK -= set.getACcounts().getCounts()[0]; + // totalAltK has sum of alt alleles of conformation now + + + // special case 
for k = 0 over all k + if ( totalAltK == 0 ) { // all-ref case + final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; + set.getLog10Likelihoods()[0] = log10Lof0; + + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return log10Lof0; + + } else { + + // initialize result with denominator + // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. + // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i + + int[] currentCount = set.getACcounts().getCounts(); + double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); + + // for current conformation, get all possible ways to break vector K into two components G1 and G2 + final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; + while (innerIterator.hasNext()) { + // check if breaking current conformation into g1 and g2 is feasible. 
+ final int[] acCount2 = innerIterator.getCurrentVector(); + final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2); + final int idx2 = innerIterator.getLinearIndex(); + // see if conformation is valid and if original pool had this conformation + // for conformation to be valid, all elements of g2 have to be <= elements of current AC set + if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) { + final double gl2 = secondGL[idx2]; + if (!Double.isInfinite(gl2)) { + final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1); + final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1); + final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); + final double sum = firstGL + gl2 + num1 + num2; + + set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); + } + } + innerIterator.next(); + } + + set.getLog10Likelihoods()[0] += denom; + } + + double log10LofK = set.getLog10Likelihoods()[0]; + + // update the MLE if necessary + final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); + + // apply the priors over each alternate allele + for (final int ACcount : altCounts ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); + + return log10LofK; + } + + /** + * Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy? 
+ * @param set AC conformation vector + * @param ploidy Ploidy of set + * @return Valid conformation + */ + private static boolean isValidConformation(final int[] set, final int ploidy) { + int sum=0; + for (final int ac: set) { + if (ac < 0) + return false; + sum += ac; + + } + + return (sum == ploidy); + } + + /** + * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, + * including updating the PL's, and assign genotypes accordingly + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes true: assign hard genotypes, false: leave as no-call + * @param ploidy number of chromosomes per sample (pool) + * @return GenotypesContext with new PLs + */ + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + List NO_CALL_ALLELES = new ArrayList<>(ploidy); + + for (int k=0; k < ploidy; k++) + NO_CALL_ALLELES.add(Allele.NO_CALL); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + + // 
Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization + // and subsetting + if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + } + else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + gb.alleles(NO_CALL_ALLELES); + else + assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + newGTs.add(gb.make()); + } + } + + return newGTs; + + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param numChromosomes Number of chromosomes per pool + */ + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + + + // find the genotype with maximum likelihoods + final int PLindex = numNewAltAlleles == 0 ? 
0 : MathUtils.maxElementIndex(newLikelihoods); + + final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); + final ArrayList alleleFreqs = new ArrayList<>(); + final ArrayList alleleCounts = new ArrayList<>(); + + + for (int k=1; k < mlAlleleCount.length; k++) { + alleleCounts.add(mlAlleleCount[k]); + final double freq = (double)mlAlleleCount[k] / (double)numChromosomes; + alleleFreqs.add(freq); + + } + + // per-pool logging of AC and AF + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); + + // remove PLs if necessary + if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) + gb.noPL(); + + ArrayList myAlleles = new ArrayList(); + + // add list of called ML genotypes to alleles list + // TODO - too unwieldy? + int idx = 0; + for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) { + for (int k=0; k < mlAlleleCount[mlind]; k++) + myAlleles.add(idx++,allelesToUse.get(mlind)); + } + gb.alleles(myAlleles); + + // TODO - deprecated so what is the appropriate method to call? 
+ if ( numNewAltAlleles > 0 ) + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java new file mode 100644 index 000000000..ea09f52e8 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -0,0 +1,496 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +/** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt allele: A* with C and T. 
This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * This class implements the conditional likelihoods summation for any number of alt + * alleles, where each alt allele has its EXACT probability of segregating calculated by + * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * After each allele has its probability calculated we compute the joint posterior + * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i + * prior for the ith least likely allele. + */ + public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + + /** + * The min. confidence of an allele to be included in the joint posterior. 
+ */ + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); + + private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; + private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + /** + * Sorts AFCalcResults by their posteriors of AF > 0, so the + */ + private final static class CompareAFCalcResultsByPNonRef implements Comparator { + @Override + public int compare(AFCalcResult o1, AFCalcResult o2) { + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + } + } + + private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); + + /** + * The AFCalc model we are using to do the bi-allelic computation + */ + final AFCalc biAlleleExactModel; + + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); + } + + /** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ + private static class MyAFCalcResult extends AFCalcResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ + final List supporting; + + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); + this.supporting = supporting; + } + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); 
+ + if ( independentResultTrackers.size() == 0 ) + throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc); + + if ( independentResultTrackers.size() == 1 ) { + // fast path for the very common bi-allelic use case + return independentResultTrackers.get(0); + } else { + // we are a multi-allelic, so we need to actually combine the results + final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors); + } + } + + /** + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC + * + * @param vc the VariantContext we want to analyze, with at least 1 alt allele + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc + */ + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List results = new LinkedList(); + + for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { + final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + results.add(resultTracker); + } + + return results; + } + + /** + * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results + */ + private static boolean goodIndependentResult(final VariantContext vc, final List results) { + if ( results.size() != vc.getNAlleles() - 1) return false; + for ( int i = 0; i < results.size(); i++ ) { + if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) + return false; + if ( ! 
results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) + return false; + } + + return true; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc) { + final int nAltAlleles = vc.getNAlleles() - 1; + + if ( nAltAlleles == 1 ) { + // fast path for bi-allelic case. + return Collections.singletonList(vc); + } else { + // go through the work of ripping up the VC into its biallelic components + final List vcs = new LinkedList(); + + for ( int altI = 0; altI < nAltAlleles; altI++ ) { + vcs.add(biallelicCombinedGLs(vc, altI + 1)); + } + + return vcs; + } + } + + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param altAlleleIndex index of the alt allele, from 0 == first alt allele + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { + if ( rootVC.isBiallelic() ) { + return rootVC; + } else { + final int nAlts = rootVC.getNAlleles() - 1; + final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); + for ( final Genotype g : rootVC.getGenotypes() ) + biallelicGenotypes.add(combineGLsPrecise(g, altAlleleIndex, nAlts)); + + final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); + final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); + 
vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); + vcb.genotypes(biallelicGenotypes); + return vcb.make(); + } + } + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case + * + * This is handled in the following way: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + @Deprecated + protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); + final double[] biAllelicPr = new double[3]; + + for ( int index = 0; index < normalizedPr.length; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + + if ( pair.alleleIndex1 == altIndex ) { + if ( pair.alleleIndex2 == altIndex ) + // hom-alt case + biAllelicPr[2] = normalizedPr[index]; + else + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + } else { + if ( pair.alleleIndex2 == altIndex ) + // het-alt 
case + biAllelicPr[1] += normalizedPr[index]; + else + // hom-non-alt + biAllelicPr[0] += normalizedPr[index]; + } + } + + final double[] GLs = new double[3]; + for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); + + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + + private static final double PHRED_2_LOG10_COEFF = -.1; + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case. + * + *

Uses the log-sum-exp trick in order to work well with very low PLs

+ * + *

This is handled in the following way:

+ * + *

Suppose we have for an A/B/C site the following GLs:

+ * + *

AA AB BB AC BC CC

+ * + *

and we want to get the bi-allelic GLs for X/B, where X is everything not B

+ * + *

XX = AA + AC + CC (since X = A or C)
+ * XB = AB + BC
+ * BB = BB
+ *

+ *

+ * This implementation uses the log sum trick in order to avoid numeric instability. + *

+ * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + protected Genotype combineGLsPrecise(final Genotype original, final int altIndex, final int nAlts ) { + + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final int[] pls = original.getPL(); + + final int nAlleles = nAlts + 1; + + final int plCount = pls.length; + + double BB = 0; + final double[] XBvalues = new double[nAlleles - 1]; + final double[] XXvalues = new double[plCount - nAlleles]; + + int xbOffset = 0; + int xxOffset = 0; + for ( int index = 0; index < plCount; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + int i = pair.alleleIndex1; + int j = pair.alleleIndex2; + if (i == j) { + if (i == altIndex) BB = PHRED_2_LOG10_COEFF * pls[index]; else XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } else if (i == altIndex || j == altIndex) + XBvalues[xbOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + else + XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } + + final double XB = MathUtils.log10sumLog10(XBvalues); + final double XX = MathUtils.log10sumLog10(XXvalues); + + final double[] GLs = new double[] { XX, XB, BB}; + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + final ArrayList sorted = new ArrayList(conditionalPNonRefResults); + + // sort 
the results, so the most likely allele is first + Collections.sort(sorted, compareAFCalcResultsByPNonRef); + + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); + final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); + + for ( int i = 0; i < sorted.size(); i++ ) { + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); + final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); + } + + return sorted; + } + + /** + * Take the independent estimates of pNonRef for each alt allele and combine them into a single result + * + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc + * + * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently + */ + protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, + final List sortedResultsWithThetaNPriors) { + int nEvaluations = 0; + final int nAltAlleles = sortedResultsWithThetaNPriors.size(); + final int[] alleleCountsOfMLE = new int[nAltAlleles]; + final double[] log10PriorsOfAC = new double[2]; + final Map log10pRefByAllele = new HashMap(nAltAlleles); + + // the sum of 
the log10 posteriors for AF == 0 and AF > 0 to determine joint probs + double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; + + boolean anyPoly = false; + for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { + final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); + final int altI = vc.getAlleles().indexOf(altAllele) - 1; + + // MLE of altI allele is simply the MLE of this allele in altAlleles + alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); + + // the AF > 0 case requires us to store the normalized likelihood for later summation + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; + log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); + + // trivial -- update the number of evaluations + nEvaluations += sortedResultWithThetaNPriors.nEvaluations; + } + + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, + // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). 
We want to estimate confidently + // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + + final double[] log10LikelihoodsOfAC = new double[] { + // L + prior = posterior => L = poster - prior + log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], + log10PosteriorOfACGt0 - log10PriorsOfAC[1] + }; + + return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), + log10pRefByAllele, sortedResultsWithThetaNPriors); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java new file mode 100644 index 000000000..b7a646d4e --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -0,0 +1,538 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Helper component to manage active region trimming + * + *

+ * It receives the user arguments that controls trimming and also performs the trimming region calculation. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +class ActiveRegionTrimmer { + + /** + * Genome location parser use in order to create and manipulate genomic intervals. + */ + private GenomeLocParser locParser; + + /** + * Holds the debug flag. If {@code true} the trimmer will output debugging messages into the log. + */ + private boolean debug; + + /** + * Holds the extension to be used based on whether GGA mode is on or off. + */ + private int usableExtension; + + /** + * Records whether the trimming intervals are going to be used to emit reference confidence, {@code true}, + * or regular HC output {@code false}. + */ + private boolean emitReferenceConfidence; + + @Hidden + @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean dontTrimActiveRegions = false; + + /** + * the maximum extent into the full active region extension that we're willing to go in genotyping our events + */ + @Hidden + @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false) + protected int discoverExtension = 25; + + @Hidden + @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false) + protected int ggaExtension = 300; + + /** + * Include at least this many bases around an event for calling it + */ + @Hidden + @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event 
for calling indels", required=false) + protected int indelPadding = 150; + + @Hidden + @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) + protected int snpPadding = 20; + + /** + * Holds a reference the trimmer logger. + */ + private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class); + + /** + * Initializes the trimmer. + * + *

+ * This method should be called once and only once before any trimming is performed. + * + * + * @param glp the genome-location-parser to be used when operating with genomic locations. + * @param debug whether to show extra debug log messages. + * @param isGGA whether the trimming region calculator should act as if we are in GGA mode or not. + * @param emitReferenceConfidence indicates whether we plan to use this trimmer to generate trimmed regions + * to be used for emitting reference confidence. + * + * @throws IllegalStateException if this trim calculator has already been initialized. + * @throws IllegalArgumentException if the input location parser is {@code null}. + * @throws UserException.BadArgumentValue if any of the user argument values is invalid. + */ + void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) { + if (locParser != null) + throw new IllegalStateException(getClass().getSimpleName() + " instance initialized twice"); + if (glp == null) + throw new IllegalArgumentException("input genome-loc-parser cannot be null"); + checkUserArguments(); + locParser = glp; + this.debug = debug; + usableExtension = isGGA ? ggaExtension : discoverExtension; + this.emitReferenceConfidence = emitReferenceConfidence; + } + + /** + * Checks user trimming argument values + * + * @throws UserException.BadArgumentValue if there is some problem with any of the arguments values. 
+ */ + private void checkUserArguments() { + if ( snpPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundSNPs","" + snpPadding + "< 0"); + if ( indelPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundIndels","" + indelPadding + "< 0"); + if ( discoverExtension < 0) throw new UserException.BadArgumentValue("maxDiscARExtension","" + discoverExtension + "< 0"); + if ( ggaExtension < 0) throw new UserException.BadArgumentValue("maxGGAAREExtension","" + ggaExtension + "< 0"); + } + + /** + * Holds the result of trimming. + * + * + * + */ + public static class Result { + + /** + * Indicates whether trimming is required per data and user request. + */ + protected final boolean needsTrimming; + + /** + * Holds the input active region. + */ + protected final ActiveRegion originalRegion; + + /** + * Holds the smaller range that contain all relevant callable variants in the + * input active region (not considering the extension). + * + */ + protected final GenomeLoc callableSpan; + + /** + * Maximum available range for the trimmed variant region. + */ + protected final GenomeLoc maximumSpan; + + /** + * The trimmed variant region span including the extension. + */ + protected final GenomeLoc extendedSpan; + + + /** + * The ideal trimmer variant region span including the extension. + */ + protected final GenomeLoc idealSpan; + + /** + * Returns the ideal trimming span. + * + *

+ * The ideal span is the one containing all callable variation overlapping the original active region span + * (without extension) and the applicable padding {@link #getPadding()} in both sides. + * + * + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public GenomeLoc getIdealSpan() { + return idealSpan; + } + + /** + * Holds the flanking spans that do not contain the callable variants. + *

+ * The first element of the pair is the left (up-stream) non-variant flank, whereas the second element is + * the right (down-stream) non-variant flank. + */ + protected final Pair nonVariantFlanks; + + /** + * Holds the collection of callable events within the variant trimming region. + */ + protected final List callableEvents; + + /** + * Required padding around the variant trimming region. + */ + protected final int padding; + + + /** + * Returns the required padding around callable variation. + * + *

+ * Notice that due to the limiting span of the original active region (including its extension) it + * is possible that the resulting final trimmed variant region span does not satisfy the padding. However + * that should be rare. + * + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int getPadding() { + return padding; + } + + /** + * Holds the maximum extension around the original active region span considered for the trimmed + * variation region. + */ + protected final int usableExtension; + + /** + * Returns the maximum extension around the original active region span considered for the trimmed + * variation region. + * + *

+ * From time to time, the trimmed region may require a span beyond the input original active region's. + * For example when there is a callable event close to one of its ends and the required padding makes it + * round beyond that limit. + *

+ * Notice that due to the limiting span of the original active region (including its extended region) it + * is possible that the resulting final trimmed variant region span goes beyond this extension including more of + * the original active region own extension. + * + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int getUsableExtension() { + return usableExtension; + } + + /** + * Holds variant-containing callable region. + *

+ * This is lazy-initialized using {@link #callableSpan}. + */ + protected ActiveRegion callableRegion; + + + /** + * Non-variant left flank region. + *

+ * This is lazy-initialized using + * {@link #nonVariantFlanks}.{@link Pair#getFirst() getFirst()}. + */ + private ActiveRegion leftFlankRegion; + + /** + * Non-variant right flank region. + *

+ * This is lazy-initialized using + * {@link #nonVariantFlanks}.{@link Pair#getSecond() getSecond()}. + */ + private ActiveRegion rightFlankRegion; + + /** + * Whether the variant trimmed region is going to be used for emitting reference confidence records. + */ + private final boolean emitReferenceConfidence; + + /** + * Creates a trimming result given all its properties. + * + * @param emitReferenceConfidence whether reference confidence output modes are on. + * @param needsTrimming whether there is any trimming needed at all. + * @param originalRegion the original active region. + * @param padding padding around contained callable variation events. + * @param extension the extension applied to the trimmed variant span. + * @param overlappingEvents contained callable variation events. + * @param nonVariantFlanks pair of non-variant flank spans around the variant containing span. + * @param extendedSpan final trimmed variant span including the extension. + * @param idealSpan the ideal span, containing all callable variation plus the required padding. + * @param maximumSpan maximum possible trimmed span based on the input original active region extended span. + * @param callableSpan variant containing span without padding. 
+ */ + protected Result(final boolean emitReferenceConfidence, final boolean needsTrimming, final ActiveRegion originalRegion, + final int padding, final int extension, + final List overlappingEvents, final Pair nonVariantFlanks, + final GenomeLoc extendedSpan, + final GenomeLoc idealSpan, + final GenomeLoc maximumSpan, + final GenomeLoc callableSpan) { + this.emitReferenceConfidence = emitReferenceConfidence; + this.needsTrimming = needsTrimming; + this.originalRegion = originalRegion; + this.nonVariantFlanks = nonVariantFlanks; + this.padding = padding; + this.usableExtension = extension; + this.callableEvents = overlappingEvents; + this.callableSpan = callableSpan; + this.idealSpan = idealSpan; + this.maximumSpan = maximumSpan; + this.extendedSpan = extendedSpan; + + if (!extendedSpan.isUnmapped() && !callableSpan.isUnmapped() && !extendedSpan.containsP(callableSpan)) + throw new IllegalArgumentException("the extended callable span must include the callable span"); + } + + + /** + * Checks whether there is any variation present in the target region. + * + * @return {@code true} if there is any variant, {@code false} otherwise. + */ + public boolean isVariationPresent() { + return ! callableEvents.isEmpty(); + } + + /** + * Checks whether the active region needs trimming. + */ + public boolean needsTrimming() { + return needsTrimming; + } + + /** + * Returns the trimmed variant containing region + * + * @throws IllegalStateException if there is no variation detected. + * + * @return never {@code null}. + */ + public ActiveRegion getCallableRegion() { + if (callableRegion == null && !extendedSpan.isUnmapped()) + //TODO this conditional is a patch to retain the current standard HC run behaviour + //TODO we should simply remove this difference between trimming with or without GVCF + //TODO embracing slight changes in the standard HC output + callableRegion = emitReferenceConfidence ? 
originalRegion.trim(callableSpan, extendedSpan) : originalRegion.trim(extendedSpan); + else if (extendedSpan.isUnmapped()) + throw new IllegalStateException("there is no variation thus no variant region"); + return callableRegion; + } + + /** + * Checks whether there is a non-empty left flanking non-variant trimmed out region. + * @return {@code true} if there is a non-trivial left flank region, {@code false} otherwise. + */ + public boolean hasLeftFlankingRegion() { + return ! nonVariantFlanks.getFirst().isUnmapped(); + } + + /** + * Checks whether there is a non-empty right flanking non-variant trimmed out region. + * @return {@code true} if there is a non-trivial right flank region, {@code false} otherwise. + */ + public boolean hasRightFlankingRegion() { + return ! nonVariantFlanks.getSecond().isUnmapped(); + } + + /** + * Returns the trimmed out left non-variant region. + *

+ * Notice that in case of no variation, the whole original region is considered the left flanking region. + * + * @throws IllegalStateException if there is not such as left flanking region. + */ + public ActiveRegion nonVariantLeftFlankRegion() { + if (leftFlankRegion == null && ! nonVariantFlanks.getFirst().isUnmapped()) + leftFlankRegion = originalRegion.trim(nonVariantFlanks.getFirst(),originalRegion.getExtension()); + else if (nonVariantFlanks.getFirst().isUnmapped()) + throw new IllegalStateException("there is no left flank non-variant trimmed out region"); + return leftFlankRegion; + } + + /** + * Returns the trimmed out right non-variant region. + */ + public ActiveRegion nonVariantRightFlankRegion() { + if (rightFlankRegion == null && ! nonVariantFlanks.getSecond().isUnmapped()) + rightFlankRegion = originalRegion.trim(nonVariantFlanks.getSecond(),originalRegion.getExtension()); + else if (nonVariantFlanks.getSecond().isUnmapped()) + throw new IllegalStateException("there is no right flank non-variant trimmed out region"); + return rightFlankRegion; + } + + /** + * Creates a result indicating that there was no trimming to be done. + */ + protected static Result noTrimming(final boolean emitReferenceConfidence, + final ActiveRegion targetRegion, final int padding, + final int usableExtension,final List events) { + final GenomeLoc targetRegionLoc = targetRegion.getLocation(); + final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,events,new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED), + targetRegionLoc,targetRegionLoc,targetRegionLoc,targetRegionLoc); + result.callableRegion = targetRegion; + return result; + } + + /** + * Creates a result indicating that no variation was found. 
+ */ + protected static Result noVariation(final boolean emitReferenceConfidence, final ActiveRegion targetRegion, + final int padding, final int usableExtension) { + final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension, + Collections.emptyList(),new Pair<>(targetRegion.getLocation(),GenomeLoc.UNMAPPED), + GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED); + result.leftFlankRegion = targetRegion; + return result; + } + } + + /** + * Returns a trimming result object from which the variant trimmed region and flanking non-variant sections + * can be recovered latter. + * + * @param originalRegion the genome location range to trim. + * @param allVariantsWithinExtendedRegion list of variants contained in the trimming location. Variants therein + * not overlapping with {@code originalRegion} are simply ignored. + * @return never {@code null}. + */ + public Result trim(final ActiveRegion originalRegion, + final TreeSet allVariantsWithinExtendedRegion) { + + + if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, + return Result.noVariation(emitReferenceConfidence,originalRegion,snpPadding, usableExtension); + + final List withinActiveRegion = new LinkedList<>(); + final GenomeLoc originalRegionRange = originalRegion.getLocation(); + boolean foundNonSnp = false; + GenomeLoc variantSpan = null; + for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { + final GenomeLoc vcLoc = locParser.createGenomeLoc(vc); + if ( originalRegionRange.overlapsP(vcLoc) ) { + foundNonSnp = foundNonSnp || ! vc.isSNP(); + variantSpan = variantSpan == null ? vcLoc : variantSpan.endpointSpan(vcLoc); + withinActiveRegion.add(vc); + } + } + final int padding = foundNonSnp ? 
indelPadding : snpPadding; + + // we don't actually have anything in the region after skipping out variants that don't overlap + // the region's full location + if ( variantSpan == null ) + return Result.noVariation(emitReferenceConfidence,originalRegion,padding, usableExtension); + + if ( dontTrimActiveRegions) + return Result.noTrimming(emitReferenceConfidence,originalRegion, padding, usableExtension, withinActiveRegion); + + final GenomeLoc maximumSpan = locParser.createPaddedGenomeLoc(originalRegionRange, usableExtension); + final GenomeLoc idealSpan = locParser.createPaddedGenomeLoc(variantSpan, padding); + final GenomeLoc finalSpan = maximumSpan.intersect(idealSpan).union(variantSpan); + + // Make double sure that, if we are emitting GVCF we won't call non-variable positions beyond the target active region span. + // In regular call we don't do so so we don't care and we want to maintain behavior, so the conditional. + final GenomeLoc callableSpan = emitReferenceConfidence ? variantSpan.intersect(originalRegionRange) : variantSpan; + + final Pair nonVariantRegions = nonVariantTargetRegions(originalRegion, callableSpan); + + if ( debug ) { + logger.info("events : " + withinActiveRegion); + logger.info("region : " + originalRegion); + logger.info("callableSpan : " + callableSpan); + logger.info("padding : " + padding); + logger.info("idealSpan : " + idealSpan); + logger.info("maximumSpan : " + maximumSpan); + logger.info("finalSpan : " + finalSpan); + } + + return new Result(emitReferenceConfidence,true,originalRegion,padding, usableExtension,withinActiveRegion,nonVariantRegions,finalSpan,idealSpan,maximumSpan,variantSpan); + } + + /** + * Calculates the list of region to trim away. + * @param targetRegion region for which to generate the flanking regions. + * @param variantSpan the span of the core region containing relevant variation and required padding. + * @return never {@code null}; 0, 1 or 2 element list. 
+ */ + private Pair nonVariantTargetRegions(final ActiveRegion targetRegion, final GenomeLoc variantSpan) { + final GenomeLoc targetRegionRange = targetRegion.getLocation(); + final int finalStart = variantSpan.getStart(); + final int finalStop = variantSpan.getStop(); + + final int targetStart = targetRegionRange.getStart(); + final int targetStop = targetRegionRange.getStop(); + + final boolean preTrimmingRequired = targetStart < finalStart; + final boolean postTrimmingRequired = targetStop > finalStop; + if (preTrimmingRequired) { + final String contig = targetRegionRange.getContig(); + return postTrimmingRequired ? new Pair<>( + locParser.createGenomeLoc(contig, targetStart, finalStart - 1), + locParser.createGenomeLoc(contig, finalStop + 1, targetStop)) : + new Pair<>(locParser.createGenomeLoc(contig, targetStart, finalStart - 1),GenomeLoc.UNMAPPED); + } else if (postTrimmingRequired) + return new Pair<>(GenomeLoc.UNMAPPED,locParser.createGenomeLoc(targetRegionRange.getContig(), finalStop + 1, targetStop)); + else + return new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java new file mode 100644 index 000000000..8cadea6ec --- /dev/null +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java @@ -0,0 +1,543 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.haplotype.EventMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.HaplotypeSizeAndBaseComparator; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.*; + +/** + * Collection of read assembly using several kmerSizes. + * + *

+ * There could be a different assembly for each kmerSize. In turn, haplotypes are the result of one of those + * assemblies. + *

+ * + *

+ * Where there is more than one possible kmerSize that generates a haplotype, we consider the smaller one. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class AssemblyResultSet { + + private final Map assemblyResultByKmerSize; + private final Set haplotypes; + private final Map assemblyResultByHaplotype; + private ActiveRegion regionForGenotyping; + private byte[] fullReferenceWithPadding; + private GenomeLoc paddedReferenceLoc; + private boolean variationPresent; + private Haplotype refHaplotype; + private boolean wasTrimmed = false; + private final CountSet kmerSizes; + private TreeSet variationEvents; + private boolean debug; + private static Logger logger = Logger.getLogger(AssemblyResultSet.class); + + /** + * Constructs a new empty assembly result set. + */ + public AssemblyResultSet() { + assemblyResultByKmerSize = new LinkedHashMap<>(4); + haplotypes = new LinkedHashSet<>(10); + assemblyResultByHaplotype = new LinkedHashMap<>(10); + kmerSizes = new CountSet(4); + } + + + /** + * Change the debug status for this assembly-result-set. + * @param newValue new value for the debug status. + */ + void setDebug(final boolean newValue) { + debug = newValue; + } + + /** + * Trims an assembly result set down based on a new set of trimmed haplotypes. + * + * @param trimmedActiveRegion the trimmed down active region. + * + * @throws NullPointerException if any argument in {@code null} or + * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. + * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones. + * + * @return never {@code null}, a new trimmed assembly result set. 
+ */ + public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion) { + + final Map originalByTrimmedHaplotypes = calculateOriginalByTrimmedHaplotypes(trimmedActiveRegion); + if (refHaplotype == null) throw new IllegalStateException(); + if (trimmedActiveRegion == null) throw new NullPointerException(); + final AssemblyResultSet result = new AssemblyResultSet(); + + for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) { + final Haplotype original = originalByTrimmedHaplotypes.get(trimmed); + if (original == null) + throw new NullPointerException("all trimmed haplotypes must have an original one"); + final AssemblyResult as = assemblyResultByHaplotype.get(original); + if (as == null) result.add(trimmed); else result.add(trimmed, as); + } + + result.setRegionForGenotyping(trimmedActiveRegion); + result.setFullReferenceWithPadding(this.fullReferenceWithPadding); + result.setPaddedReferenceLoc(this.paddedReferenceLoc); + if (result.refHaplotype == null) + throw new IllegalStateException("missing reference haplotype in the trimmed set"); + result.wasTrimmed = true; + return result; + } + + private Map calculateOriginalByTrimmedHaplotypes(final ActiveRegion trimmedActiveRegion) { + if ( debug ) logger.info("Trimming active region " + getRegionForGenotyping() + " with " + getHaplotypeCount() + " haplotypes"); + + final List haplotypeList = getHaplotypeList(); + + // trim down the haplotypes + final Map originalByTrimmedHaplotypes = new HashMap<>(); + + for ( final Haplotype h : haplotypeList ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + + if ( trimmed != null ) { + if (originalByTrimmedHaplotypes.containsKey(trimmed)) { + if (trimmed.isReference()) { + originalByTrimmedHaplotypes.remove(trimmed); + originalByTrimmedHaplotypes.put(trimmed, h); + } + } else + originalByTrimmedHaplotypes.put(trimmed,h); + } else if (h.isReference()) + throw new IllegalStateException("trimming eliminates the reference haplotype"); + 
else if ( debug ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + + " because it starts with or ends with an insertion or deletion when trimmed to " + + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet()); + + // resort the trimmed haplotypes. + Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator()); + final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size()); + for (final Haplotype trimmed : trimmedHaplotypes) + sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed)); + + + if ( debug ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + + haplotypeList.size() + " to only " + trimmedHaplotypes.size()); + if ( debug ) + for ( final Haplotype remaining: trimmedHaplotypes ) + logger.info("Remains: " + remaining + " cigar " + remaining.getCigar()); + return sortedOriginalByTrimmedHaplotypes; + } + + /** + * Query the reference haplotype in the result set. + * @return {@code null} if none wasn't yet added, otherwise a reference haplotype. + */ + public Haplotype getReferenceHaplotype() { + return refHaplotype; + } + + /** + * Checks whether there is any variation present in the assembly result set. + * + *

+ * This is equivalent to whether there is more than one haplotype. + *

+ * + * @return {@code true} if there is variation present, {@code false} otherwise. + */ + public boolean isVariationPresent() { + return variationPresent && haplotypes.size() > 1; + } + + /** + * Dumps debugging information into a print-writer. + * + * @param pw where to dump the information. + * + * @throws NullPointerException if {@code pw} is {@code null}. + */ + public void debugDump(final PrintWriter pw) { + if (getHaplotypeList().size() == 0) { + return; + } + pw.println("Active Region " + this.regionForGenotyping.getLocation()); + pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc()); + pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation()); + pw.println("Haplotype count " + haplotypes.size()); + final Map kmerSizeToCount = new HashMap<>(); + + for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) { + final AssemblyResult as = e.getValue(); + final int kmerSize = as.getGraph().getKmerSize(); + if (kmerSizeToCount.containsKey(kmerSize)) { + kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1); + } else { + kmerSizeToCount.put(kmerSize,1); + } + } + pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() ); + Integer[] kmerSizes = new Integer[kmerSizeToCount.size()]; + kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes); + Arrays.sort(kmerSizes); + pw.println("Kmer sizes values " + Arrays.toString(kmerSizes)); + for (int size : kmerSizes) { + pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size)); + } + } + + /** + * Adds a haplotype to the result set without indicating a generating assembly result. + * + *

+ * It is possible to call this method with the same haplotype several times. In that case the second and further + * calls won't have any effect (thus returning {@code false}). + *

+ * + * @param h the haplotype to add to the assembly result set. + * + * @throws NullPointerException if {@code h} is {@code null} + * @throws IllegalArgumentException if {@code h} does not have a genome location. + * + * @return {@code true} if the assembly result set has been modified as a result of this call. + */ + public boolean add(final Haplotype h) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + if (haplotypes.contains(h)) + return false; + haplotypes.add(h); + updateReferenceHaplotype(h); + return true; + } + + /** + * Adds simultaneously a haplotype and the generating assembly-result. + * + *

+ * Haplotypes and their assembly-result can be added multiple times, although just the first call will have + * any effect (return value is {@code true}). + *

+ * + * + * @param h haplotype to add. + * @param ar assembly-result that is assumed to have given rise to that haplotype. + * + * @throws NullPointerException if {@code h} or {@code ar} is {@code null}. + * @throws IllegalArgumentException if {@code h} has not defined genome location. + * + * @return {@code true} iff this called changes the assembly result set. + */ + public boolean add(final Haplotype h, final AssemblyResult ar) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (ar == null) throw new NullPointerException("input assembly-result cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + + final boolean assemblyResultAdditionReturn = add(ar); + + if (haplotypes.contains(h)) { + final AssemblyResult previousAr = assemblyResultByHaplotype.get(h); + if (previousAr == null) { + assemblyResultByHaplotype.put(h, ar); + return true; + } else if (!previousAr.equals(ar)) + throw new IllegalStateException("there is already a different assembly result for the input haplotype"); + else + return assemblyResultAdditionReturn; + } else { + haplotypes.add(h); + assemblyResultByHaplotype.put(h,ar); + updateReferenceHaplotype(h); + if (h.isNonReference()) variationPresent = true; + return true; + } + } + + /** + * Add a assembly-result object. + * + * @param ar the assembly result to add. + * + * @throws NullPointerException if {@code ar} is {@code null}. + * @throws IllegalStateException if there is an assembly result with the same kmerSize. + * @return {@code true} iff this addition changed the assembly result set. 
+ */ + public boolean add(final AssemblyResult ar) { + if (ar == null) + throw new NullPointerException(); + final int kmerSize = ar.getKmerSize(); + if (assemblyResultByKmerSize.containsKey(kmerSize)) { + if (!assemblyResultByKmerSize.get(kmerSize).equals(ar)) + throw new IllegalStateException("a different assembly result with the same kmerSize was already added"); + return false; + } else { + assemblyResultByKmerSize.put(kmerSize, ar); + kmerSizes.add(kmerSize); + return true; + } + } + + /** + * Returns the current region for genotyping. + * + * @return might be {@code null}. + */ + public ActiveRegion getRegionForGenotyping() { + return regionForGenotyping; + } + + /** + * Sets the region for genotyping. + * + * @param regionForGenotyping the new value. + */ + public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) { + this.regionForGenotyping = regionForGenotyping; + } + + /** + * Returns the current full reference with padding. + * + * @return might be {@code null}. + */ + public byte[] getFullReferenceWithPadding() { + return fullReferenceWithPadding; + } + + /** + * Sets the full reference with padding base sequence. + * + * @param fullReferenceWithPadding the new value. + */ + public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) { + this.fullReferenceWithPadding = fullReferenceWithPadding; + } + + /** + * Returns the padded reference location. + * + * @return might be {@code null} + */ + public GenomeLoc getPaddedReferenceLoc() { + return paddedReferenceLoc; + } + + /** + * Changes the padded reference location. + * @param paddedReferenceLoc the new value. + */ + public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) { + this.paddedReferenceLoc = paddedReferenceLoc; + } + + /** + * Returns the number of haplotypes in the assembly result set. + * @return {@code 0} or greater. + */ + public int getHaplotypeCount() { + return haplotypes.size(); + } + + /** + * Returns the haplotypes as a list. + * + *

+ * The result is unmodifiable. + *

+ * + * @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly. + */ + public List getHaplotypeList() { + return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()])); + } + + /** + * Returns the maximum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMaximumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.max(); + } + + /** + * Indicates whether there are more than one kmerSize in the set. + * + * @return {@code true} iff there is more than one kmerSize assembly in the set. + */ + public boolean hasMultipleKmerSizes() { + return kmerSizes.size() > 1; + } + + /** + * Returns the minimum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMinimumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.min(); + } + + /** + * Returns a read-threading graph in the assembly set that has a particular kmerSize. + * + * @param kmerSize the requested kmerSize. + * + * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize. + */ + public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) { + final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize); + if (assemblyResult == null) return null; + return assemblyResult.getThreadingGraph(); + } + + /** + * Checks whether this assembly result set was trimmed. + * + * @return {@code true} iff this assembly result set was trimmed. 
+ */ + public boolean wasTrimmed() { + return wasTrimmed; + } + + /** + * Marks the assembly as not having variation even if it has more than one haplotype. + */ + public void resetVariationPresent() { + variationPresent = false; + } + + /** + * Dumps debugging information into a logger. + * + * @param logger where to dump the information. + * + * @throws NullPointerException if {@code logger} is {@code null}. + */ + public void debugDump(final Logger logger) { + final StringWriter sw = new StringWriter(); + final PrintWriter pw = new PrintWriter(sw); + debugDump(pw); + final String str = sw.toString(); + final String[] lines = str.split("\n"); + for (final String line : lines) { + if (line.isEmpty()) { + continue; + } + logger.debug(line); + } + } + + /** + * Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the + * reference haplotype and updates {@link #refHaplotype} accordingly. + * + *

+ * This method assumes that the calling code has verified that the haplotype was not already in {@link #haplotypes}, + * i.e. that it is really a new one. Otherwise it will result in an exception if it happens to be a reference + * haplotype and this has already been set. This is the case even if the new haplotype and the current reference + * are equal. + *

+ * + * @param newHaplotype the new haplotype. + * @throws NullPointerException if {@code newHaplotype} is {@code null}. + * @throws IllegalStateException if there is already a reference haplotype. + */ + private void updateReferenceHaplotype(final Haplotype newHaplotype) { + if (!newHaplotype.isReference()) return; + if (refHaplotype == null) + refHaplotype = newHaplotype; + else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality. + throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); + } + + /** + * Returns a sorted set of variant events that best explain the haplotypes found by the assembly + * across kmerSizes. + * + *

+ * The result is sorted incrementally by location. + * + * @return never {@code null}, but perhaps an empty collection. + */ + public TreeSet getVariationEvents() { + if (variationEvents == null) { + final List haplotypeList = getHaplotypeList(); + EventMap.buildEventMapsForHaplotypes(haplotypeList,fullReferenceWithPadding,paddedReferenceLoc,debug); + variationEvents = EventMap.getAllVariantContexts(haplotypeList); + } + return variationEvents; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java new file mode 100644 index 000000000..d65251e58 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -0,0 +1,566 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.DefaultHashMap; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.util.*; + +public class GenotypingEngine { + private final static Logger logger = 
Logger.getLogger(GenotypingEngine.class); + + private final boolean DEBUG; + private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; + private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied + private final VariantAnnotatorEngine annotationEngine; + private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; + + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, + final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, + final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { + this.DEBUG = DEBUG; + this.annotationEngine = annotationEngine; + this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; + noCall.add(Allele.NO_CALL); + this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; + } + + /** + * Carries the result of a call to #assignGenotypeLikelihoods + */ + public static class CalledHaplotypes { + private final List calls; + private final Set calledHaplotypes; + + protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { + if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) + throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); + this.calls = calls; + this.calledHaplotypes = calledHaplotypes; + } + + /** + * Get the list of calls made at this location + * @return a non-null (but potentially empty) list of calls + */ + public List getCalls() { + return calls; + } + + /** + * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). 
+ * @return a non-null set of haplotypes + */ + public Set getCalledHaplotypes() { + return calledHaplotypes; + } + } + + /** + * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute + * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling + * + * The list of samples we're working with is obtained from the haplotypeReadMap + * + * @param UG_engine UG Engine with basic input parameters + * @param haplotypes Haplotypes to assign likelihoods to + * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) + * @param perSampleFilteredReadList + * @param ref Reference bytes at active region + * @param refLoc Corresponding active region genome location + * @param activeRegionWindow Active window + * @param genomeLocParser GenomeLocParser + * @param activeAllelesToGenotype Alleles to genotype + * @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts. + * + * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes + * + */ + @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) + @Ensures("result != null") + // TODO - can this be refactored? this is hard to follow! 
+ public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final RefMetaDataTracker tracker, + final List activeAllelesToGenotype, + final boolean emitReferenceConfidence) { + // sanity check input arguments + if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); + if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); + if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); + if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); + + // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference + // that carry events among the haplotypes + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); + + // Walk along each position in the key set and create each event to be outputted + 
final Set calledHaplotypes = new HashSet<>(); + final List returnCalls = new ArrayList<>(); + final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); + + for( final int loc : startPosKeySet ) { + if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); + + if( eventsAtThisLoc.isEmpty() ) { continue; } + + // Create the event mapping object which maps the original haplotype events to the events present at just this locus + final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); + + // Sanity check the priority list for mistakes + final List priorityList = makePriorityList(eventsAtThisLoc); + + // Merge the event to find a common reference representation + + VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); + + if( mergedVC == null ) { continue; } + + final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() + ? 
GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL; + + if (emitReferenceConfidence) { + final List alleleList = new ArrayList<>(); + alleleList.addAll(mergedVC.getAlleles()); + alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + vcb.alleles(alleleList); + mergedVC = vcb.make(); + } + + final Map mergeMap = new LinkedHashMap<>(); + mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele + for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { + mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function + } + + final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); + + if( DEBUG ) { + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + } + + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); + + if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap); + + final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); + VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel); + if( call != null ) { + final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? 
alleleReadMap : + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); + if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap_annotations); + final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); + + VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); + + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) + annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + + // maintain the set of all called haplotypes + for ( final Allele calledAllele : call.getAlleles() ) { + final List haplotypeList = alleleMapper.get(calledAllele); + if (haplotypeList == null) continue; + calledHaplotypes.addAll(haplotypeList); + } + + returnCalls.add( annotatedCall ); + } + } + } + + return new CalledHaplotypes(returnCalls, calledHaplotypes); + } + + /** + * Add the allele + * @param stratifiedReadMap target per-read-allele-likelihood-map. + */ + public static Map addMiscellaneousAllele(final Map stratifiedReadMap) { + final Allele miscellanoeusAllele = GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE; + for (Map.Entry perSample : stratifiedReadMap.entrySet()) { + for (Map.Entry> perRead : perSample.getValue().getLikelihoodReadMap().entrySet()) { + double bestLikelihood = Double.NEGATIVE_INFINITY; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (Map.Entry perAllele : perRead.getValue().entrySet()) { + final double value = perAllele.getValue(); + if (value > bestLikelihood) { + secondBestLikelihood = bestLikelihood; + bestLikelihood = value; + } else if (value < bestLikelihood && value > secondBestLikelihood) { + secondBestLikelihood = value; + } + } + final double miscellanousLikelihood = Double.isInfinite(secondBestLikelihood) ? 
bestLikelihood : secondBestLikelihood; + perSample.getValue().add(perRead.getKey(),miscellanoeusAllele,miscellanousLikelihood); + } + } + return stratifiedReadMap; + } + + /** + * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts + * + * @param haplotypes the list of haplotypes we're working with + * @param haplotypeReadMap map from samples -> the per read allele likelihoods + * @param ref the reference bases (over the same interval as the haplotypes) + * @param refLoc the span of the reference bases + * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) + * @return + */ + private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, + final Map haplotypeReadMap, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + + if ( in_GGA_mode ) startPosKeySet.clear(); + + //cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently + if ( !in_GGA_mode ) { + // run the event merger if we're not in GGA mode + final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); + if ( mergedAnything ) + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + } + + if ( in_GGA_mode ) { + for( final VariantContext compVC : activeAllelesToGenotype ) { + startPosKeySet.add( compVC.getStart() ); + } + } + + return startPosKeySet; + } + + /** + * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view + * 
@param vcs a list of variant contexts + * @return the list of the sources of vcs in the same order + */ + private List makePriorityList(final List vcs) { + final List priorityList = new LinkedList<>(); + for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); + return priorityList; + } + + private List getVCsAtThisLocation(final List haplotypes, + final int loc, + final List activeAllelesToGenotype) { + // the overlapping events to merge into a common reference view + final List eventsAtThisLoc = new ArrayList<>(); + + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final EventMap eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! + int compCount = 0; + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + List alleleSet = new ArrayList<>(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; + // check if this event is already in the list of events due to a repeat in the input alleles track + final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); + boolean alreadyExists = false; + for( final VariantContext eventToTest : eventsAtThisLoc ) { + if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { + alreadyExists = true; + } + } + if( !alreadyExists ) { + eventsAtThisLoc.add(candidateEventToAdd); + } + alleleCount++; + } + } + compCount++; + } + } + + return eventsAtThisLoc; + } + + /** + * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele + * 
@param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods + * @param mergedVC Input VC with event to genotype + * @return GenotypesContext object wrapping genotype objects with PLs + */ + @Requires({"alleleReadMap!= null", "mergedVC != null"}) + @Ensures("result != null") + private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { + final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); + // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample + for( final String sample : alleleReadMap.keySet() ) { + final int numHaplotypes = mergedVC.getAlleles().size(); + final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; + final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); + int glIndex = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC + } + } + genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); + } + return genotypes; + } + + private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, + final Map perSampleReadMap, + final Map> perSampleFilteredReadList, + final VariantContext call ) { + + final Map returnMap = new LinkedHashMap<>(); + final GenomeLoc callLoc = parser.createGenomeLoc(call); + for( final Map.Entry sample : perSampleReadMap.entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + + for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { + // only count the read if it overlaps the event, otherwise it is not added to 
the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end... + for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { + likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); + } + } + } + + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods + for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + // only count the read if it overlaps the event, otherwise it is not added to the output read list at all + if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { + for( final Allele allele : call.getAlleles() ) { + likelihoodMap.add(read, allele, 0.0); + } + } + } + + returnMap.put(sample.getKey(), likelihoodMap); + } + return returnMap; + } + + /** + * Removes symbolic events from list of haplotypes + * @param haplotypes Input/output list of haplotypes, before/after removal + */ + // TODO - split into input haplotypes and output haplotypes as not to share I/O arguments + @Requires("haplotypes != null") + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { + final List haplotypesToRemove = new ArrayList<>(); + for( final Haplotype h : haplotypes ) { + for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { + if( vc.isSymbolic() ) { + for( final Haplotype h2 : haplotypes ) { + for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { + if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events + haplotypesToRemove.add(h); + break; + } + } + } + } + } + } + haplotypes.removeAll(haplotypesToRemove); + } + + // BUGBUG: ugh, too complicated + protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, + final Map> alleleMapper, + final Map 
perSampleDownsamplingFraction ) { + + final Map alleleReadMap = new LinkedHashMap<>(); + for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele + final List mappedHaplotypes = alleleMapperEntry.getValue(); + for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read + double maxLikelihood = Double.NEGATIVE_INFINITY; + for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele + if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string + maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); + } + } + perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); + } + } + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling + alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); + } + + return alleleReadMap; + } + + protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { + final Map> alleleMapper = new LinkedHashMap<>(); + for( final Map.Entry entry : mergeMap.entrySet() ) { + alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey()))); + } + return alleleMapper; + } + + @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) + @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) + protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); + final Event refEvent = new Event(null); + eventMapper.put(refEvent, 
new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) { + eventMapper.put(new Event(vc), new ArrayList()); + } + + for( final Haplotype h : haplotypes ) { + if( h.getEventMap().get(loc) == null ) { + eventMapper.get(refEvent).add(h); + } else { + for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { + if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { + eventMapper.get(new Event(vcAtThisLoc)).add(h); + break; + } + } + } + } + + return eventMapper; + } + + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) + protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { + if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } + final List eventAllelesForSample = new ArrayList<>(); + for( final Allele a : haplotypeAllelesForSample ) { + final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); + for( int iii = 0; iii < alleleMapper.size(); iii++ ) { + final List mappedHaplotypes = alleleMapper.get(iii); + if( mappedHaplotypes.contains(haplotype) ) { + eventAllelesForSample.add(eventAlleles.get(iii)); + break; + } + } + } + return eventAllelesForSample; + } + + @Deprecated + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); + } + + protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { + for( final VariantContext vc : list ) { + if( vc.hasSameAllelesAs(vcToTest) ) { + return true; + } + } + return false; + } + + protected static class Event { + public VariantContext vc; + + public Event( final VariantContext vc ) { + this.vc = vc; + } + + @Override + public boolean equals( final Object obj ) { + return obj instanceof Event && ((((Event) obj).vc == 
null && vc == null) || (((Event) obj).vc != null && vc != null && ((Event) obj).vc.hasSameAllelesAs(vc))) ; + } + + @Override + public int hashCode() { + return (vc == null ? -1 : vc.getAlleles().hashCode()); + } + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java new file mode 100644 index 000000000..276103277 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -0,0 +1,1161 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC 
NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; 
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.gvcf.GVCFWriter; +import org.broadinstitute.sting.utils.haplotype.*; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import 
org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. + * + *

+ * <h2>Input</h2>
+ * <p>
+ * Input bam file(s) from which to make calls
+ * </p>
+ *

+ * <h2>Output</h2>
+ * <p>
+ * VCF file with raw, unrecalibrated SNP and indel calls.
+ * </p>
+ *

+ * <h2>Examples</h2>
+ * <pre>
+ *   java
+ *     -jar GenomeAnalysisTK.jar
+ *     -T HaplotypeCaller
+ *     -R reference/human_g1k_v37.fasta
+ *     -I sample1.bam [-I sample2.bam ...] \
+ *     --dbsnp dbSNP.vcf \
+ *     -stand_call_conf [50.0] \
+ *     -stand_emit_conf 10.0 \
+ *     [-L targets.interval_list]
+ *     -o output.raw.snps.indels.vcf
+ * </pre>
+ *

+ * <h2>Caveats</h2>
+ * <ul>
+ * <li>The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.</li>
+ * </ul>
+ *
+ * + * @author rpoplin + * @since 8/22/11 + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.LOCUS) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) +@ActiveRegionTraversalParameters(extension=100, maxRegion=300) +@ReadFilters({HCMappingQualityFilter.class}) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) +public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable { + // ----------------------------------------------------------------------------------------------- + // general haplotype caller arguments + // ----------------------------------------------------------------------------------------------- + + /** + * A raw, unfiltered, highly sensitive callset in VCF format. + */ + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Hidden + @Advanced + @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", + doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) + protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; + + @Hidden + @Advanced + @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) + protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN; + + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) + protected PrintStream graphWriter = null; + + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. 
+ * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to + * use in specific areas where you want to better understand why the HC is making specific calls. + * + * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more + * easily see which reads go with these haplotype. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. + * + * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen + * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png + * + */ + @Advanced + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) + protected StingSAMFileWriter bamWriter = null; + private HaplotypeBAMWriter haplotypeBAMWriter; + + /** + * The type of BAM output we want to see. 
+ */ + @Advanced + @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + private double log10GlobalReadMismappingRate; + + /** + * Active region trimmer reference. + */ + @ArgumentCollection + protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer(); + + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Advanced + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + /** + * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. 
+ */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); + + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Advanced + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + */ + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { "Standard" }; + + @ArgumentCollection + private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + + // ----------------------------------------------------------------------------------------------- + // arguments to control internal behavior of the read threading assembler + // ----------------------------------------------------------------------------------------------- + + @Advanced + @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) + protected List kmerSizes = Arrays.asList(10, 25); + + @Advanced + @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) + protected boolean dontIncreaseKmerSizesForCycles = false; + + 
@Advanced + @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) + protected int numPruningSamples = 1; + + @Hidden + @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) + protected boolean dontRecoverDanglingTails = false; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + + /** + * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. + * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. + * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). + * This requirement is a temporary workaround for an issue with index compression. + */ + @Advanced + @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting experimental reference confidence scores", required = false) + protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + public enum ReferenceConfidenceMode { + NONE, + BP_RESOLUTION, + GVCF + } + + /** + * The GQ partition intervals + * + * Should be a non-empty list of boundaries. 
For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: + * + * [0, 0] + * (0, 10] + * (10, 20] + * (20, 30] + * (30, 40] + * (40, 50] + * (50, 99] + * + * Note that in the GATK GQ values are capped at 99. + */ + @Advanced + @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) + protected List GVCFGQBands = Arrays.asList(5, 20, 60); + + /** + * This parameter determines the maximum size of an indel considered as potentially segregating in the + * reference model. It is used to eliminate reads from being indel informative at a site, and determines + * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to + * X means that each informative read is consistent with any indel of size < X being present at a specific + * position in the genome, given its alignment to the reference. + */ + @Advanced + @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false) + protected int indelSizeToEliminateInRefModel = 10; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + + /** + * The minimum confidence needed for a given base for it to be used in variant calling. 
+ */ + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public byte MIN_BASE_QUALTY_SCORE = 10; + + /** + * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution. + * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the + * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values + * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher + * depth to produce calls). + */ + @Advanced + @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false) + protected int MIN_PRUNE_FACTOR = 2; + + @Advanced + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) + protected int gcpHMM = 10; + + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. 
+ */ + @Hidden + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + + @Advanced + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) + protected boolean USE_ALLELES_TRIGGER = false; + + @Advanced + @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) + protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; + + /** + * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their + * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of + * its edit distance from the reference, in that the read could have originated from the reference haplotype but + * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but + * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence + * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single + * read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference + * that this (and any) read could contribute against reference is Q30. 
+ * + * Set this term to any negative number to turn off the global mapping rate + */ + @Advanced + @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) + protected int phredScaledGlobalReadMismappingRate = 45; + + /** + * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype + * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the + * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their + * weights, no matter how many paths are possible to generate from the graph. Putting this number too low + * will result in dropping true variation because paths that include the real variant are not even considered. + */ + @Advanced + @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) + protected int maxNumHaplotypesInPopulation = 128; + + @Advanced + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) + protected boolean mergeVariantsViaLD = false; + + // ----------------------------------------------------------------------------------------------- + // arguments for debugging / developing the haplotype caller + // ----------------------------------------------------------------------------------------------- + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
+ */ + @Hidden + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + + @Hidden + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + protected String keepRG = null; + + @Hidden + @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) + protected boolean justDetermineActiveRegions = false; + + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + + @Advanced + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) + protected boolean DEBUG; + + @Hidden + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) + protected boolean debugGraphTransformations = false; + + @Hidden + @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) + protected boolean dontUseSoftClippedBases = false; + + @Hidden + @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) + protected boolean captureAssemblyFailureBAM = false; + + @Hidden + @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + + @Hidden + @Argument(fullName="noFpga", shortName="noFpga", doc="If provided, disables the use of the FPGA HMM implementation", required = false) + protected boolean noFpga = false; + + // Parameters to control read error correction + @Hidden + @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectReads = false; + + @Hidden + @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) + protected int kmerLengthForReadErrorCorrection = 25; + + @Hidden + @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) + protected int minObservationsForKmerToBeSolid = 20; + + /** + * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base + * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). + * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. 
+ */ + @Advanced + @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) + public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; + + // ----------------------------------------------------------------------------------------------- + // done with Haplotype caller parameters + // ----------------------------------------------------------------------------------------------- + + // the UG engines + private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + + // the assembly engine + private LocalAssemblyEngine assemblyEngine = null; + + // the likelihoods engine + private LikelihoodCalculationEngine likelihoodCalculationEngine = null; + + // the genotyping engine + private GenotypingEngine genotypingEngine = null; + + // fasta reference reader to supplement the edges of the reference sequence + protected CachingIndexedFastaSequenceFile referenceReader; + + // reference base padding size + private static final int REFERENCE_PADDING = 500; + + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + + private byte MIN_TAIL_QUALITY; + private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; + + // the minimum length of a read we'd consider using for genotyping + private final static int MIN_READ_LENGTH = 10; + + private List samplesList = new ArrayList<>(); + + private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file + private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. 
Should never appear anywhere in a VCF file + + ReferenceConfidenceModel referenceConfidenceModel = null; + + // as determined experimentally Nov-Dec 2013 + public final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; + public final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + super.initialize(); + + if (dontGenotype && emitReferenceConfidence == ReferenceConfidenceMode.GVCF) + throw new UserException("You cannot request gVCF output and do not genotype at the same time"); + + if ( emitReferenceConfidence() ) { + SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; + SCAC.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + + // also, we don't need to output several of the annotations + annotationsToExclude.add("ChromosomeCounts"); + annotationsToExclude.add("FisherStrand"); + annotationsToExclude.add("QualByDepth"); + + // but we definitely want certain other ones + annotationsToUse.add("StrandBiasBySample"); + logger.info("Standard Emitting and Calling confidence set to 0.0 for reference-model confidence output"); + } + + if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) + throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); + + // get all of the unique sample names + Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samplesList.addAll( samples ); + // initialize the UnifiedGenotyper Engine which is used to call into the exact model + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user + // HC GGA mode depends critically on 
EMIT_ALL_SITES being set for the UG engine + UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + if (emitReferenceConfidence() && !UG_engine.getUAC().annotateAllSitesWithPLs) { + UG_engine.getUAC().annotateAllSitesWithPLs = true; + logger.info("All sites annotated with PLs force to true for reference-model confidence output"); + } + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; + simpleUAC.CONTAMINATION_FRACTION_FILE = null; + simpleUAC.exactCallsLog = null; + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + if( UAC.CONTAMINATION_FRACTION_FILE != null ) { + UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); + } + + // initialize the output VCF header + final VariantAnnotatorEngine 
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + + Set headerInfo = new HashSet<>(); + + // all annotation fields from VariantAnnotatorEngine + headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); + // all callers need to add these standard annotation header lines + VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, + VCFConstants.DOWNSAMPLED_KEY, + VCFConstants.MLE_ALLELE_COUNT_KEY, + VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + // all callers need to add these standard FORMAT field header lines + VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, + VCFConstants.GENOTYPE_KEY, + VCFConstants.GENOTYPE_QUALITY_KEY, + VCFConstants.DEPTH_KEY, + VCFConstants.GENOTYPE_PL_KEY); + + // FILTER fields are added unconditionally as it's not always 100% certain the circumstances + // where the filters are used. For example, in emitting all sites the lowQual field is used + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + + initializeReferenceConfidenceModel(samples, headerInfo); + + vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); + + try { + // fasta reference reader to supplement the edges of the reference sequence + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); + } + + // create and setup the assembler + assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); + + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); + assemblyEngine.setDebug(DEBUG); + assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); + 
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); + assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); + assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE); + + MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1); + + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + + // setup the likelihood calculation engine + if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; + + // configure the global mismapping rate + if ( phredScaledGlobalReadMismappingRate < 0 ) { + log10GlobalReadMismappingRate = - Double.MAX_VALUE; + } else { + log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); + logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); + } + + // create our likelihood calculation engine + likelihoodCalculationEngine = createLikelihoodCalculationEngine(); + + final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? 
new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); + + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); + + if ( bamWriter != null ) { + // we currently do not support multi-threaded BAM writing, so exception out + if ( getToolkit().getTotalNumberOfThreads() > 1 ) + throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode."); + haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); + } + + trimmer.initialize(getToolkit().getGenomeLocParser(), DEBUG, + UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); + } + + private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) { + referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); + if ( emitReferenceConfidence() ) { + if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + // a kluge to enforce the use of this indexing strategy + if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || + getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { + throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); + } + + try { + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); + } + } + } + } + + /** + * Instantiates the appropriate likelihood calculation engine. 
+ * + * @return never {@code null}. + */ + private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { + switch (likelihoodEngineImplementation) { + case PairHMM: + return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + case GraphBased: + return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); + case Random: + return new RandomLikelihoodCalculationEngine(); + default: + //Note: we do not include in the error message list as it is of no grand public interest. + throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine + + "'. Please use one of the following instead: 'PairHMM' and 'GraphBased'."); + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // isActive + // + //--------------------------------------------------------------------------------------------------------------- + + // enable deletions in the pileup + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable non primary and extended reads in the active region + @Override + public EnumSet desiredReadStates() { + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); + } + + @Override + @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) + public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext 
context ) { + + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); + if( vcFromAllelesRod != null ) { + return new ActivityProfileState(ref.getLocus(), 1.0); + } + } + + if( USE_ALLELES_TRIGGER ) { + return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + } + + if( context == null || context.getBasePileup().isEmpty() ) + // if we don't have any data, just abort early + return new ActivityProfileState(ref.getLocus(), 0.0); + + final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied + final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); + final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); + final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); + for( final Map.Entry sample : splitContexts.entrySet() ) { + final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods; + genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); + } + + final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); + final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP); + final double isActiveProb = vcOut == null ? 
0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); + + return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + private final static List NO_CALLS = Collections.emptyList(); + @Override + public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { + if ( justDetermineActiveRegions ) + // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work + return NO_CALLS; + + if( !originalActiveRegion.isActive() ) + // Not active so nothing to do! + return referenceModelForNoVariation(originalActiveRegion, true); + + final List activeAllelesToGenotype = new ArrayList<>(); + if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { + if ( vc.isNotFiltered() ) { + activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode + } + } + // No alleles found in this region so nothing to do! + if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } + } else { + // No reads here so nothing to do! 
+            if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
+        }
+
+        // run the local assembler, getting back a collection of information on how we should proceed
+        final AssemblyResultSet untrimmedAssemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
+
+        final TreeSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
+        // TODO - line below might be unnecessary : it might be that assemblyResult will always have those alleles anyway
+        // TODO - so check and remove if that is the case:
+        allVariationEvents.addAll(activeAllelesToGenotype);
+
+        final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents);
+
+        if (!trimmingResult.isVariationPresent())
+            return referenceModelForNoVariation(originalActiveRegion,false);
+
+        final AssemblyResultSet assemblyResult =
+                trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
+
+        final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
+
+        // filter out reads from genotyping which fail mapping quality based criteria
+        //TODO - why not do this before any assembly is done? Why not just once at the beginning of this method
+        //TODO - on the originalActiveRegion?
+        //TODO - if you move this up you might have to consider changing referenceModelForNoVariation
+        //TODO - which also filters reads.
+        final Collection filteredReads = filterNonPassingReads( regionForGenotyping );
+        final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads );
+
+        // abort early if something is out of the acceptable range
+        // TODO is this ever true at this point??? perhaps GGA. Need to check.
+        if( ! assemblyResult.isVariationPresent() )
+            return referenceModelForNoVariation(originalActiveRegion, false);
+
+        // For sure this is not true if gVCF is on.
+        if (dontGenotype) return NO_CALLS; // user requested we not proceed
+
+
+        // TODO is this ever true at this point??? perhaps GGA. Need to check.
+        if( regionForGenotyping.size() == 0 ) {
+            // no reads remain after filtering so nothing else to do!
+            return referenceModelForNoVariation(originalActiveRegion, false);
+        }
+
+        // evaluate each sample's reads against all haplotypes
+        //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
+        final List haplotypes = assemblyResult.getHaplotypeList();
+        final Map> reads = splitReadsBySample( regionForGenotyping.getReads() );
+
+        // Calculate the likelihoods: CPU intensive part.
+        final Map stratifiedReadMap =
+                likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads);
+
+        // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
+        // was a bad interaction between that selection and the marginalization that happens over each event when computing
+        // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the
+        // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
+        // in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
+
+        final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
+                haplotypes,
+                stratifiedReadMap,
+                perSampleFilteredReadList,
+                assemblyResult.getFullReferenceWithPadding(),
+                assemblyResult.getPaddedReferenceLoc(),
+                regionForGenotyping.getLocation(),
+                getToolkit().getGenomeLocParser(),
+                metaDataTracker,
+                activeAllelesToGenotype, emitReferenceConfidence() );
+
+        // TODO -- must disable if we are doing NCT, or set the output type of !
presorted
+        if ( bamWriter != null ) {
+            haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
+                    haplotypes,
+                    assemblyResult.getPaddedReferenceLoc(),
+                    haplotypes,
+                    calledHaplotypes.getCalledHaplotypes(),
+                    stratifiedReadMap);
+        }
+
+        if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
+
+
+        if ( emitReferenceConfidence() ) {
+            if ( !containsCalls(calledHaplotypes) ) {
+                // none of the potential haplotypes was actually called, so emit the reference model instead
+                return referenceModelForNoVariation(originalActiveRegion, false);
+            } else {
+                final List result = new LinkedList<>();
+                // output left-flanking non-variant section:
+                if (trimmingResult.hasLeftFlankingRegion())
+                    result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantLeftFlankRegion(),false));
+                // output variant containing region.
+                result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
+                        calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
+                        stratifiedReadMap, calledHaplotypes.getCalls()));
+                // output right-flanking non-variant section:
+                if (trimmingResult.hasRightFlankingRegion())
+                    result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false));
+                return result;
+            }
+        } else {
+            return calledHaplotypes.getCalls();
+        }
+    }
+
+    // Returns true iff at least one genotype in at least one of the calls is actually called.
+    private boolean containsCalls(final GenotypingEngine.CalledHaplotypes calledHaplotypes) {
+        final List calls = calledHaplotypes.getCalls();
+        if (calls.isEmpty()) return false;
+        for (final VariantContext call : calls)
+            for (final Genotype genotype : call.getGenotypes())
+                if (genotype.isCalled())
+                    return true;
+        return false;
+    }
+
+    /**
+     * High-level function that runs the assembler on the active region reads,
+     * returning a data structure with the resulting information needed
+     * for further HC steps
+     *
+     * @param activeRegion the region we should assemble
+     * @param activeAllelesToGenotype additional alleles we might
need to genotype (can be empty) + * @return the AssemblyResult describing how to proceed with genotyping + */ + protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + // Create the reference haplotype which is the bases from the reference that make up the active region + finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails + + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); + + // Create ReadErrorCorrector object if requested - will be used within assembly engine. + ReadErrorCorrector readErrorCorrector = null; + if (errorCorrectReads) + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); + + try { + final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); + assemblyResultSet.debugDump(logger); + return assemblyResultSet; + + } catch ( final Exception e ) { + // Capture any exception that might be thrown, and write out the assembly failure BAM if requested + if ( captureAssemblyFailureBAM ) { + final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); + for ( final GATKSAMRecord read : activeRegion.getReads() ) { + writer.addAlignment(read); + } + writer.close(); + } + throw e; + } + } + + /** + * Helper function to create the reference haplotype out of the active region and a padded loc + * @param activeRegion the active region from which to generate the reference haplotype + 
* @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be
+     * @return a non-null haplotype
+     */
+    private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) {
+        return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc);
+    }
+
+    /**
+     * Create a ref model result (ref model or no calls depending on mode) for an active region without any variation
+     * (i.e. the region is not active, or assembled to just ref)
+     *
+     * @param region the region to return a no-variation result
+     * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done)
+     * @return a list of variant contexts (can be empty) to emit for this ref region
+     */
+    private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) {
+        if ( emitReferenceConfidence() ) {
+            //TODO - why can't the activeRegion manage its own one-time finalization and filtering?
+            //TODO - perhaps we can remove the last parameter of this method and the three lines below?
+ if ( needsToBeFinalized ) + finalizeActiveRegion(region); + filterNonPassingReads(region); + + final GenomeLoc paddedLoc = region.getExtendedLoc(); + final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); + final List haplotypes = Collections.singletonList(refHaplotype); + return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, + paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), + Collections.emptyList()); + } else + return NO_CALLS; + } + + /** + * Create a context that maps each read to the reference haplotype with log10 L of 0 + * @param refHaplotype a non-null reference haplotype + * @param samples a list of all samples + * @param region the active region containing reads + * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref + */ + public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, + final List samples, + final ActiveRegion region) { + final Allele refAllele = Allele.create(refHaplotype, true); + + final Map map = new LinkedHashMap<>(1); + for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for ( final GATKSAMRecord read : entry.getValue() ) { + likelihoodMap.add(read, refAllele, 0.0); + } + map.put(entry.getKey(), likelihoodMap); + } + + return map; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(List callsInRegion, Integer numCalledRegions) { + for( final VariantContext call : callsInRegion ) { + vcfWriter.add( call ); + } + return (callsInRegion.isEmpty() ? 
0 : 1) + numCalledRegions; + } + + @Override + public void onTraversalDone(Integer result) { + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it + referenceConfidenceModel.close(); + //TODO remove the need to call close here for debugging, the likelihood output stream should be managed + //TODO (open & close) at the walker, not the engine. + likelihoodCalculationEngine.close(); + logger.info("Ran local assembly on " + result + " active regions"); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // private helper functions + // + //--------------------------------------------------------------------------------------------------------------- + + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { + if (activeRegion.isFinalized()) return; + + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + + // Loop through the reads hard clipping the adaptor and low quality tails + final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); + for( final GATKSAMRecord myRead : activeRegion.getReads() ) { + GATKSAMRecord clippedRead; + if (errorCorrectReads) + clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); + else // default case: clip low qual ends of reads + clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY ); + + if ( dontUseSoftClippedBases || ! 
ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) { + // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all + clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + } else { + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + } + + clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) ); + if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); + if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); + readsToUse.add(clippedRead); + } + } + } + + // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. 
+
+        final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
+
+        // handle overlapping read pairs from the same fragment
+        cleanOverlappingReadPairs(downsampledReads);
+
+        activeRegion.clearReads();
+        activeRegion.addAll(downsampledReads);
+        activeRegion.setFinalized(true);
+    }
+
+    // Removes (and returns) reads failing basic criteria: too short, mapping quality < 20 (hard-coded
+    // here; TODO(review): consider hoisting to a named constant), bad mate, or not in the kept read group.
+    private Set filterNonPassingReads( final ActiveRegion activeRegion ) {
+        final Set readsToRemove = new LinkedHashSet<>();
+        for( final GATKSAMRecord rec : activeRegion.getReads() ) {
+            if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
+                readsToRemove.add(rec);
+            }
+        }
+        activeRegion.removeAll( readsToRemove );
+        return readsToRemove;
+    }
+
+    // Computes the reference interval around the active region, padded by REFERENCE_PADDING on each side
+    // and clamped to [1, contig length].
+    private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) {
+        final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1);
+        final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength());
+        return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight);
+    }
+
+    // Convenience overload that uses this walker's own sample list.
+    private Map> splitReadsBySample( final Collection reads ) {
+        return splitReadsBySample(samplesList, reads);
+    }
+
+    // Partitions reads by the sample of their read group; every sample in samplesList gets an entry
+    // (possibly empty). NOTE(review): a read whose sample is absent from samplesList would NPE on the
+    // returnMap.get(...) below — assumes samplesList covers all read groups; confirm at call sites.
+    public static Map> splitReadsBySample( final List samplesList, final Collection reads ) {
+        final Map> returnMap = new HashMap<>();
+        for( final String sample : samplesList) {
+            List readList = returnMap.get( sample );
+            if( readList == null ) {
+                readList = new ArrayList<>();
+                returnMap.put(sample, readList);
+            }
+        }
+        for( final GATKSAMRecord read : reads ) {
+            returnMap.get(read.getReadGroup().getSample()).add(read);
+        }
+
+        return returnMap;
+    }
+
+    /**
+     * Are we emitting a reference confidence in
some form, or not? + * @return true if we are + */ + private boolean emitReferenceConfidence(){ + return emitReferenceConfidence != ReferenceConfidenceMode.NONE; + } + + /** + * Clean up reads/bases that overlap within read pairs + * + * @param reads the list of reads to consider + */ + private void cleanOverlappingReadPairs(final List reads) { + for ( final List perSampleReadList : splitReadsBySample(reads).values() ) { + final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); + for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) + FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java new file mode 100644 index 000000000..cfd07da67 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -0,0 +1,467 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.util.*; + +/** + * Haplotype-based resolution of variants in 2 different eval files. + * + *

+ * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. + * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. + * Records are annotated with the set and status attributes. + * + *

<h3>Input</h3>

+ *

+ * 2 variant files to resolve. + *

+ * + *

<h3>Output</h3>

+ *

+ * A single consensus VCF. + *

+ * + *

<h3>Examples</h3>

+ *
+ * java -Xmx1g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T HaplotypeResolver \
+ *   -V:v1 input1.vcf \
+ *   -V:v2 input2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW)) +public class HaplotypeResolver extends RodWalker { + + protected static final String INTERSECTION_SET = "intersection"; + protected static final String SAME_STATUS = "same"; + protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch"; + protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles"; + protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype"; + protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther"; + protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents"; + + protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50; + protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000; + protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100; + protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER; + + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter baseWriter = null; + private VariantContextWriter writer; + + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + protected String SET_KEY = "set"; + + /** + * Set to 'null' if you don't want the status field emitted. 
+ */ + @Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false) + protected String STATUS_KEY = "status"; + + private final LinkedList queue = new LinkedList(); + private String source1, source2; + private final List sourceVCs1 = new ArrayList(); + private final List sourceVCs2 = new ArrayList(); + + + private class VCcontext { + public final Collection vcs; + public final GenomeLoc loc; + public final ReferenceContext ref; + + public VCcontext(final Collection vcs, final ReferenceContext ref) { + this.vcs = vcs; + this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next()); + this.ref = ref; + } + } + + public void initialize() { + + if ( variants.size() != 2 ) { + throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files"); + } + source1 = variants.get(0).getName(); + source2 = variants.get(1).getName(); + + if ( SET_KEY.toLowerCase().equals("null") ) + SET_KEY = null; + if ( STATUS_KEY.toLowerCase().equals("null") ) + STATUS_KEY = null; + + // for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header) + Set headerLines = new HashSet(); + if ( SET_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record")); + if ( STATUS_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); + final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); + baseWriter.writeHeader(vcfHeader); + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variants, context.getLocation()); + if ( 
VCs.size() == 0 ) + return 0; + + final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref); + + // TODO -- what should we do about filtered records? + + if ( !queue.isEmpty() ) { + + final VCcontext previous = queue.getLast(); + if ( !previous.loc.onSameContig(vc.loc) || + previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS || + queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) { + purgeQueue(); + } + } + + queue.addLast(vc); + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( !queue.isEmpty() ) + purgeQueue(); + writer.close(); + } + + private void purgeQueue() { + + final ReferenceContext refContext = queue.getFirst().ref; + + // divide them up by source + while ( !queue.isEmpty() ) { + VCcontext context = queue.removeFirst(); + for ( final VariantContext vc: context.vcs ) { + if ( vc.getSource().equals(source1) ) + sourceVCs1.add(vc); + else + sourceVCs2.add(vc); + } + } + + writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS); + + if ( sourceVCs1.isEmpty() ) { + writeAll(sourceVCs2, source2, null); + } else if ( sourceVCs2.isEmpty() ) { + writeAll(sourceVCs1, source1, null); + } else { + resolveByHaplotype(refContext); + } + + // allow for GC of the data + sourceVCs1.clear(); + sourceVCs2.clear(); + } + + private void writeAll(final List sourceVCs, final String set, final String status) { + for ( final VariantContext vc : sourceVCs ) { + writeOne(vc, set, status); + } + } + + private void writeOne(final VariantContext vc, final String set, final String status) { + final Map attrs = new HashMap<>(); + if ( SET_KEY != null && set != null ) + attrs.put(SET_KEY, set); + if ( STATUS_KEY != null && status != null ) + attrs.put(STATUS_KEY, status); + writer.add(new VariantContextBuilder(vc).attributes(attrs).make()); + } + + private void 
writeAndPurgeAllEqualVariants(final List sourceVCs1, final List sourceVCs2, final String status) { + + int currentIndex1 = 0, currentIndex2 = 0; + int size1 = sourceVCs1.size(), size2 = sourceVCs2.size(); + VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + + while ( current1 != null && current2 != null ) { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.equals(loc2) || + (loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) { + // test the alleles + if ( determineAndWriteOverlap(current1, current2, status) ) { + sourceVCs1.remove(currentIndex1); + sourceVCs2.remove(currentIndex2); + size1--; + size2--; + } else { + currentIndex1++; + currentIndex2++; + } + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + currentIndex1++; + current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); + } else { + currentIndex2++; + current2 = (currentIndex2 < size2 ? 
sourceVCs2.get(currentIndex2): null); + } + } + } + + private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) { + final int allelesFrom1In2 = findOverlap(vc1, vc2); + final int allelesFrom2In1 = findOverlap(vc2, vc1); + final int totalAllelesIn1 = vc1.getAlternateAlleles().size(); + final int totalAllelesIn2 = vc2.getAlternateAlleles().size(); + + final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1; + final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2; + + boolean thereIsOverlap = true; + + if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, status); + } else if ( allAllelesFrom1Overlap ) { + writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2); + } else if ( allAllelesFrom2Overlap ) { + writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1); + } else if ( allelesFrom1In2 > 0 ) { + writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS); + } else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction + writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS); + } else { + thereIsOverlap = false; + } + + return thereIsOverlap; + } + + private static int findOverlap(final VariantContext target, final VariantContext comparison) { + int overlap = 0; + for ( final Allele allele : target.getAlternateAlleles() ) { + if ( comparison.hasAlternateAllele(allele) ) + overlap++; + } + return overlap; + } + + private static final double SW_MATCH = 4.0; + private static final double SW_MISMATCH = -10.0; + private static final double SW_GAP = -25.0; + private static final double SW_GAP_EXTEND = -1.3; + private void resolveByHaplotype(final ReferenceContext refContext) { + + final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext); + final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext); + + final 
SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + + // protect against SW failures + if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 || + swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) { + // TODO -- handle errors appropriately + logger.debug("Bad SW alignment; aborting at " + refContext.getLocus()); + return; + } + + // order results by start position + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); + if ( source1Map.size() == 0 || source2Map.size() == 0 ) { + // TODO -- handle errors appropriately + logger.debug("No source alleles; aborting at " + refContext.getLocus()); + return; + } + + // create lists and test for equality + final List source1Alleles = new ArrayList(source1Map.values()); + final List source2Alleles = new ArrayList(source2Map.values()); + + writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS); + if ( source1Alleles.isEmpty() ) { + writeAll(source2Alleles, source2, null); + } else if ( source2Alleles.isEmpty() ) { + writeAll(source1Alleles, source1, null); + } else { + writeDifferences(source1Alleles, source2Alleles); + } + } + + private byte[] generateHaplotype(final List sourceVCs, final ReferenceContext refContext) { + + final StringBuilder sb = new StringBuilder(); + + final int 
startPos = refContext.getWindow().getStart(); + int currentPos = startPos; + final byte[] reference = refContext.getBases(); + + for ( final VariantContext vc : sourceVCs ) { + // add any missing reference context + int vcStart = vc.getStart(); + final int refAlleleLength = vc.getReference().length(); + if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position) + vcStart++; + + while ( currentPos < vcStart ) + sb.append((char)reference[currentPos++ - startPos]); + + // add the alt allele + sb.append(vc.getAlternateAllele(0).getBaseString()); + + // skip the reference allele + currentPos += refAlleleLength; + } + // add any missing reference context + final int stopPos = refContext.getWindow().getStop(); + while ( currentPos < stopPos ) + sb.append((char)reference[currentPos++ - startPos]); + + return sb.toString().getBytes(); + } + + private void writeDifferences(final List source1Alleles, final List source2Alleles) { + int currentIndex1 = 0, currentIndex2 = 0; + final int size1 = source1Alleles.size(), size2 = source2Alleles.size(); + VariantContext current1 = source1Alleles.get(0); + VariantContext current2 = source2Alleles.get(0); + + while ( currentIndex1 < size1 || currentIndex2 < size2 ) { + if ( current1 == null ) { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( current2 == null ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? 
source1Alleles.get(currentIndex1): null); + } else { + + final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); + final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); + + if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) { + String status; + if ( loc1.getStart() == loc2.getStart() ) { + final String allele1 = current1.getAlternateAllele(0).getBaseString(); + final String allele2 = current2.getAlternateAllele(0).getBaseString(); + if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 ) + status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS; + else + status = SAME_START_DIFFERENT_ALLELES_STATUS; + } else { + status = OVERLAPPING_EVENTS_STATUS; + } + + writeOne(current1, INTERSECTION_SET, status); + currentIndex1++; + currentIndex2++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); + } else if ( loc1.isBefore(loc2) ) { + writeOne(current1, source1, null); + currentIndex1++; + current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); + } else { + writeOne(current2, source2, null); + currentIndex2++; + current2 = (currentIndex2 < size2 ? 
source2Alleles.get(currentIndex2): null); + } + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java new file mode 100644 index 000000000..102562504 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java @@ -0,0 +1,452 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class KmerSequence implements List { + private final byte[] sequence; + private final int start; + private final int size; + private final int kmerSize; + private final int rawLength; + + /** + * Creates a kmer sequence from a read's sequence. + * + * @param read the read to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + public KmerSequence(final SAMRecord read, final int kmerSize) { + this(read.getReadBases(), kmerSize); + } + + /** + * Creates a kmer sequence from a haplotype's sequence. + * + * @param hap the haplotype to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + @SuppressWarnings("unused") + public KmerSequence(final Haplotype hap, final int kmerSize) { + this(hap.getBases(), kmerSize); + } + + /** + * Creates a kmer sequence out of a byte sequence. + * + * @param sequence the byte array to represent as a kmer sequence. + * @param kmerSize the kmer size. + */ + public KmerSequence(final byte[] sequence, final int kmerSize) { + this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); + } + + /** + * Creates a kmer sequence out of a range of a byte array. + * + * @param sequence the input array. + * @param start inclusive first position of the array that maps to the first position in the first kmer. + * @param size number of kmers in the output. + * @param kmerSize kmer length in bases. + * @param rawLength the length of the range in bases. 
+ */ + protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) { + if (sequence == null) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (rawLength > sequence.length - start) { + throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity"); + } + if (size < 0) { + throw new IllegalArgumentException("the length cannot be negative"); + } + if (start < 0) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (size > 0 && size + kmerSize - 1 > rawLength) { + throw new IllegalArgumentException( + String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) ); + } + this.sequence = sequence; + this.start = start; + this.size = size; + this.kmerSize = kmerSize; + this.rawLength = rawLength; + } + + public int kmerSize() { + return kmerSize; + } + + public KmerSequence subsequence(final int from, final int to) { + if (from < 0 || from > to) { + throw new IllegalArgumentException(); + } + if (to > size) { + throw new IllegalArgumentException(); + } + return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to)); + } + + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size == 0; + } + + @Override + public boolean contains(final Object o) { + if (o instanceof Kmer) { + if (o instanceof MyKmer) { + final MyKmer k = (MyKmer) o; + if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) { + return true; + } + } + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return false; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return true; + } + } + return false; + } else { + return false; + } + } + + @Override + 
public Iterator iterator() { + return new Iterator() { + + private int offset = 0; + + @Override + public boolean hasNext() { + return offset < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + offset++,kmerSize); /* post-increment advances the cursor; without it hasNext() never turns false and iteration loops forever on the first kmer */ + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Override + public Object[] toArray() { + return toArray(new Kmer[size()]); + } + + @Override + @SuppressWarnings("unchecked") + public T[] toArray(final T[] a) { + if (a == null) { + throw new IllegalArgumentException(); + } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { + throw new IllegalArgumentException(); + } else { + T[] result; + if (a.length < size) { + result = (T[]) Array.newInstance(a.getClass().getComponentType(), size); + } else { + result = a; + } + for (int i = 0; i < size; i++) { + result[i] = (T) new Kmer(sequence,start + i,kmerSize); + } + return result; + } + } + + @Override + public boolean add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean remove(final Object o) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean containsAll(final Collection c) { + for (final Object o : c) + if (!contains(o)) + return false; + return true; + } + + @Override + public boolean addAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean addAll(final int index, final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean removeAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean retainAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer get(final int index) { + if (index < 0 || index >= size) { + throw new IllegalArgumentException(); + } + 
return new Kmer(sequence,start + index,kmerSize); + } + + @Override + public Kmer set(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer remove(final int index) { + throw new UnsupportedOperationException(); + } + + @Override + public int indexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + public int lastIndexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = size - 1; i >= 0; i--) { + int j; + for (j = kmerSize - 1; j >= 0; j--) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j < 0) { /* the countdown loop only falls through to -1 when every base matched; a mismatch break leaves j >= 0, so the old 'j == 0' test matched exactly the wrong cases */ + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + public ListIterator listIterator() { + return new MyListIterator(0); + } + + @Override + public ListIterator listIterator(final int index) { + return new MyListIterator(index); + } + + @Override + public List subList(final int fromIndex, final int toIndex) { + return subsequence(fromIndex,toIndex); + } + + /** + * Returns the byte array representation of the kmer sequence. + * @return never {@code null}. + */ + public byte[] getBytes() { + if (start == 0 && rawLength == sequence.length) + return sequence; + else + return Arrays.copyOfRange(sequence, start, rawLength + start); + } + + /** + * Internal class that implements the {@link Kmer} more efficiently + * making reference to the sequence's own byte array. 
+ */ + protected class MyKmer extends Kmer { + + /** + * Create a new instance give the offset in the byte array. + * @param start the start base offset for the kmer. + */ + public MyKmer(final int start) { + super(sequence,start,kmerSize); + } + } + + /** + * Iterator implementation of Kmer elements. + */ + private class MyListIterator implements ListIterator { + + private int i = 0; + + /** + * Creates a iterator at certain offset in the sequence. + * @param idx the start position or kmer offset. + */ + private MyListIterator(final int idx) { + i = idx; + } + + @Override + public boolean hasNext() { + return i < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + i++,kmerSize); + } + + @Override + public boolean hasPrevious() { + return i > 0; + } + + @Override + public Kmer previous() { + return new Kmer(sequence,start + --i,kmerSize); + } + + @Override + public int nextIndex() { + return i; + } + + @Override + public int previousIndex() { + return i - 1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void set(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java diff 
--git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java new file mode 100644 index 000000000..7f7e65817 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -0,0 +1,514 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import 
org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; + +import java.io.File; +import java.util.*; + +/** + * Code for estimating the reference confidence + * + * This code can estimate the probability that the data for a single sample is consistent with a + * well-determined REF/REF diploid genotype. + * + * User: depristo + * Date: 6/21/13 + * Time: 12:52 PM + */ +public class ReferenceConfidenceModel { + + //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now + public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele + + private final GenomeLocParser genomeLocParser; + private final Set samples; + private final SAMFileHeader header; // TODO -- really shouldn't depend on this + private final int indelInformativeDepthIndelSize; + + private final static boolean WRITE_DEBUGGING_BAM = false; + private final SAMFileWriter debuggingWriter; + + private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + + /** + * Create a new ReferenceConfidenceModel + * + * @param genomeLocParser how we create genome locs + * @param samples the list of all samples we'll be considering with this model + * @param header the SAMFileHeader describing the read information (used for debugging) + * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths + */ + public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, + final Set samples, + final SAMFileHeader header, + final int indelInformativeDepthIndelSize) { + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); + if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); + if ( header == null ) throw new IllegalArgumentException("header 
cannot be empty"); + if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); + + this.genomeLocParser = genomeLocParser; + this.samples = samples; + this.header = header; + this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; + + if ( WRITE_DEBUGGING_BAM ) { + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(true); + debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam")); + } else { + debuggingWriter = null; + } + + initializeIndelPLCache(); + } + + /** + * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence + * @return a non-null set of VCFHeaderLines + */ + public Set getVCFHeaderLines() { + final Set headerLines = new LinkedHashSet<>(); + // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? + headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + return headerLines; + } + + /** + * Close down this reference model, closing down any debugging information opened during execution + */ + public void close() { + if ( debuggingWriter != null ) debuggingWriter.close(); + } + + + /** + * Calculate the reference confidence for a single sample given the its read data + * + * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing + * detailed information about the certainty that the sample is hom-ref for each base in the region. 
+ * + * + * + * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() + * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the + * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. + * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) + * @param activeRegion the active region we want to get the reference confidence over + * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes + * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the + * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence + * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) + * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence + * contexts as well as calls from variantCalls if any were provided + */ + public List calculateRefConfidence(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, + final Map stratifiedReadMap, + final List variantCalls) { + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( 
stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); + + final GenomeLoc refSpan = activeRegion.getLocation(); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); + final byte[] ref = refHaplotype.getBases(); + final List results = new ArrayList<>(refSpan.size()); + final String sampleName = stratifiedReadMap.keySet().iterator().next(); + + final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); + for ( final ReadBackedPileup pileup : refPileups ) { + final GenomeLoc curPos = pileup.getLocation(); + final int offset = curPos.getStart() - refSpan.getStart(); + + final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls); + if ( overlappingSite != null ) { + // we have some overlapping site, add it to the list of positions + if ( overlappingSite.getStart() == curPos.getStart() ) + results.add(overlappingSite); + } else { + // otherwise emit a reference confidence variant context + final int refOffset = offset + globalRefOffset; + final byte refBase = ref[refOffset]; + final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + homRefCalc.capByHomRefLikelihood(); + + final Allele refAllele = Allele.create(refBase, true); + final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), 
curPos.getStart(), curPos.getStart(), refSiteAlleles); + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); + gb.AD(homRefCalc.AD_Ref_Any); + gb.DP(homRefCalc.getDP()); + + // genotype likelihood calculation + final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); + final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); + final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); + + // now that we have the SNP and indel GLs, we take the one with the least confidence, + // as this is the most conservative estimate of our certainty that we are hom-ref. + // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000 + // we are very certain that there's no indel here, but the SNP confidence imply that we are + // far less confident that the ref base is actually the only thing here. So we take 0,10,100 + // as our GLs for the site. + final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); + + gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); + gb.PL(leastConfidenceGLs.getAsPLs()); + //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + + vcb.genotypes(gb.make()); + results.add(vcb.make()); +// logger.info(" => VariantContext " + vcb.make()); + } + } + + return results; + } + + /** + * Get the GenotypeLikelihoods with the least strong corresponding GQ value + * @param gl1 first to consider (cannot be null) + * @param gl2 second to consider (cannot be null) + * @return gl1 or gl2, whichever has the worst GQ + */ + protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) { + return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? 
gl1 : gl2; + } + + /** + * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site + * + * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site + * @return non-null GenotypeLikelihoods given N + */ + protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { + return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + } + + protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway + private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + private void initializeIndelPLCache() { + for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final double homRef = 0.0; + final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } + } + + /** + * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. 
alt + * + * @param pileup the read backed pileup containing the data we want to evaluate + * @param refBase the reference base at this pileup position + * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation + * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips + * @return a RefVsAnyResult genotype call + */ + public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { + final RefVsAnyResult result = new RefVsAnyResult(); + + for( final PileupElement p : pileup ) { + final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); + if( p.isDeletion() || qual > minBaseQual ) { + int AA = 0; final int AB = 1; int BB = 2; + if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + AA = 2; + BB = 0; + if( hqSoftClips != null && p.isNextToSoftClip() ) { + hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); + } + result.AD_Ref_Any[1] += p.getRepresentativeCount(); + } else { + result.AD_Ref_Any[0] += p.getRepresentativeCount(); + } + result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); + result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); + result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; + } + } + + return result; + } + + /** + * Get a list of pileups that span the entire active region span, in order, one for each position 
+ */ + private List getPileupsOverReference(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, + final GenomeLoc activeRegionSpan, + final Map stratifiedReadMap) { + + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + + List realignedReads; + + if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner + realignedReads = activeRegion.getReads(); + } else { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.setOnlyRealignInformativeReads(true); + writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + } + + if ( debuggingWriter != null ) + for ( final GATKSAMRecord read : 
realignedReads ) + debuggingWriter.addAlignment(read); + + final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, + true, genomeLocParser, samples, false); + + final List pileups = new LinkedList<>(); + final int startPos = activeRegionSpan.getStart(); + AlignmentContext next = libs.advanceToLocus(startPos, true); + for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) { + if ( next != null && next.getLocation().getStart() == curPos ) { + pileups.add(next.getBasePileup()); + next = libs.hasNext() ? libs.next() : null; + } else { + // no data, so we create empty pileups + pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos))); + } + } + + return pileups; + } + + /** + * Return the rightmost variant context in maybeOverlapping that overlaps curPos + * + * @param curPos non-null genome loc + * @param maybeOverlapping a collection of variant contexts that might overlap curPos + * @return a VariantContext, or null if none overlaps + */ + protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) { + VariantContext overlaps = null; + for ( final VariantContext vc : maybeOverlapping ) { + if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) { + if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { + overlaps = vc; + } + } + } + return overlaps; + } + + /** + * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart + * assuming no insertions or deletions in the read w.r.t. 
the reference + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxSum if the sum goes over this value, return immediately + * @return the sum of quality scores for readBases that mismatch their corresponding ref bases + */ + protected final int sumMismatchingQualities(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxSum) { + final int n = Math.min(readBases.length - readStart, refBases.length - refStart); + int sum = 0; + + for ( int i = 0; i < n; i++ ) { + final byte readBase = readBases[readStart + i]; + final byte refBase = refBases[refStart + i]; + if ( readBase != refBase ) { + sum += readQuals[readStart + i]; + if ( sum > maxSum ) // abort early + return sum; + } + } + + return sum; + } + + /** + * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxIndelSize the max indel size to consider for the read to be informative + * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart + */ + protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxIndelSize) { + // fast 
exit when n bases left < maxIndelSize + if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { + return false; + } + + final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); + + // consider each indel size up to max in turn, checking if an indel that deletes either the ref bases (deletion) + // or read bases (insertion) would fit as well as the original baseline sum of mismatching quality scores + for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) { + for ( final boolean checkInsertion : Arrays.asList(true, false) ) { + final int readI, refI; + if ( checkInsertion ) { + readI = readStart + indelSize; + refI = refStart; + } else { + readI = readStart; + refI = refStart + indelSize; + } + + final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum); + if ( score <= baselineMMSum ) + return false; + } + } + + return true; + } + + /** + * Calculate the number of indel informative reads at pileup + * + * @param pileup a pileup + * @param pileupOffsetIntoRef the position of the pileup in the reference + * @param ref the ref bases + * @param maxIndelSize maximum indel size to consider in the informativeness calculation + * @return an integer >= 0 + */ + protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) { + int nInformative = 0; + for ( final PileupElement p : pileup ) { + final GATKSAMRecord read = p.getRead(); + final int offset = p.getOffset(); + + // doesn't count as evidence + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) + continue; + + // todo -- this code really should handle CIGARs directly instead of relying on the above tests + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { + 
nInformative += p.getRepresentativeCount(); + if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { + return MAX_N_INDEL_INFORMATIVE_READS; + } + } + } + return nInformative; + } + + /** + * Create a reference haplotype for an active region + * + * @param activeRegion the active region + * @param refBases the ref bases + * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation() + * @return a reference haplotype + */ + public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) { + final Haplotype refHaplotype = new Haplotype(refBases, true); + final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart(); + if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart); + refHaplotype.setAlignmentStartHapwrtRef(alignmentStart); + final Cigar c = new Cigar(); + c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); + refHaplotype.setCigar(c); + return refHaplotype; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdge.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java new file mode 100644 index 000000000..4eeb18eb6 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + + +import java.util.List; +import java.util.ListIterator; + +/** + * Represents a route or path through a graph. + *

+ * In contrast with a {@link Path}, a route keeps track of the + * path taken at furcations in order to speed up some path comparisons like the + * one implemented by {@link #isSuffix}. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class Route extends Path { + + protected final Route previousRouteWithLastVertexThatIsForkOrJoin; + protected final boolean lastVertexIsForkOrJoin; + + /** + * Create a zero length route with a start in a particular vertex: + * + * @param initialVertex the first vertex of the route. + * @param graph the new route's graph. + * + * @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}. + * or if {@code initialVertex} does not belong to {@code graph}. + */ + public Route(final V initialVertex, final BaseGraph graph) { + super(initialVertex, graph); + previousRouteWithLastVertexThatIsForkOrJoin = null; + lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1; + } + + @Override + public boolean equals(final Object other) { + if (other == null) return false; + if (other == this) return true; + if (! (other instanceof Route)) return false; + @SuppressWarnings("unchecked") + final Route otherRoute = (Route) other; + return otherRoute.length() == this.length() && isSuffix(otherRoute); + } + + /** + * Extends a route into a new instance. + * + * @param prefix the route to extend. + * @param nextVertex the vertex to extend the route to. + * + * @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null} + * or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph + * that would connect {@code prefix}'s last vertex with {@code nextVertex} directly. + */ + public Route(final Route prefix, final V nextVertex) { + this(prefix,resolveSuffixEdge(prefix,nextVertex)); + } + + + /** + * Extends a route into a new instance. + * + * @param prevVertex the vertex to extend the route to. + * @param suffix the route to extend. 
+ * + * @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null} + * or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph + * that would connect {@code suffix}'s first vertex with {@code prevVertex} directly. + */ + public Route(final V prevVertex, final Route suffix) { + this(resolvePrefixEdge(prevVertex, suffix),suffix); + } + + /** + * Resolves the prefix edge as required by {@link Route(V,Route)}. + */ + private static E resolvePrefixEdge(final V prevVertex, final Route suffix) { + if (prevVertex == null) throw new NullPointerException(); + if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException(); + final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex()); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Resolves the suffix edge as required by {@link Route(Route,V)} + */ + private static E resolveSuffixEdge(final Route prefix, final V nextVertex) { + if (nextVertex == null) throw new IllegalArgumentException(); + if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException(); + final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Extends a route by prefixing an edge. + * + * @param initialEdge the extending edge. + * @param suffix the original path. + * + * @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is + * not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}. 
+ */ + public Route(final E initialEdge, final Route suffix) { + super(initialEdge,suffix); + final V firstVertex = getFirstVertex(); + if(suffix.length() == 0) { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } else { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin; + if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null) + previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin); + else + previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ? + new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) : + graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } + } + + /** + * Create copy of an existing route. + * @param route the route to copy + * + * @throws NullPointerException if {@code route} is {@code null}. + */ + protected Route(final Route route) { + super(route); + lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin; + previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + /** + * Create a new Route extending another one with an edge + * + * @param route the route to extend. + * @param edge the edge to extend the route with. + * + * @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}. + */ + public Route(final Route route, final E edge) { + super(route, edge); + lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? 
route : route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + @Override + public boolean isSuffix(final Path other) { + if (other == this) + return true; + else if (other == null) + throw new IllegalArgumentException("other path must not be null"); + else if (getGraph() != other.getGraph()) + throw new IllegalArgumentException("other path must be part of the same graph"); + else if (other instanceof Route) + return isRouteSuffix((Route)other); + else + return super.isSuffix(other); + } + + @Override + public String toString() { + return super.toString().replace("Path{", "Route{"); + } + + /** + * Faster version when comparing with a route. + */ + protected boolean isRouteSuffix(final Route other) { + if (other.getGraph() != this.getGraph()) + throw new IllegalArgumentException("you cannot compare routes on different graphs"); + else if (lastVertex != other.lastVertex) // obvious case. + return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null + && other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure. + return false; + else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose! 
+ return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) { + final ListIterator myEdges = edgesInOrder.listIterator(edgesInOrder.size()); + final ListIterator otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size()); + while (otherEdges.hasPrevious()) + if (myEdges.previous() != otherEdges.previous()) + return false; + return true; + } else + return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin) + || (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex + && previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin)); + } + + /** + * Checks whether the last vertex in the route is a fork or a joining vertex. + * @return {@code true} iff so. + */ + public boolean lastVertexIsForkOrJoin() { + return lastVertexIsForkOrJoin; + } + + /** + * Returns the longest prefix route that has as a last vertex a join or furcation vertex. + * + * @return never {@code null}. + */ + public Route getPrefixRouteWithLastVertexThatIsForkOrJoin() { + return previousRouteWithLastVertexThatIsForkOrJoin; + } + + + + /** + * Splice out the first few vertices of the route. + * + * @param length how many vertices to splice out + * @return a new route without those spliced vertices. + * + * @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative. + * Notice that non-vertex route are no legal routes. 
+ */ + public Route splicePrefix(final int length) { + if (length == 0) + return this; + if (length >= length()) + throw new IllegalArgumentException("prefix slicing to long"); + if (length < 0) + throw new IllegalArgumentException("prefix cannot be negative"); + + final List resultEdges = getEdges().subList(length,length()); + Route result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),graph); + for (final E edge : resultEdges) + result = new Route<>(result,edge); + return result; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java new file mode 100644 index 000000000..a9b14e40b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -0,0 +1,1645 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import net.sf.samtools.util.RuntimeIOException; +import net.sf.samtools.util.SequenceUtil; +import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.text.XReadLines; +import 
org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; + +/** + * Performs local realignment of reads to correct misalignments due to the presence of indels. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *
    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. + *
  3. Running the realigner over those intervals (IndelRealigner)
  4. + *
+ *

+ * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + *

+ * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A realigned version of your input BAM file(s). + *

+ * + *

Example

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T IndelRealigner \
+ *   -R ref.fasta \
+ *   -I input.bam \
+ *   -targetIntervals intervalListFromRTC.intervals \
+ *   -o realignedBam.bam \
+ *   [-known /path/to/indels.vcf] \
+ *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ * 
+ * + *

Caveats

+ * + *
  • + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *
  • + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + *
+ * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) +public class IndelRealigner extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; + + public enum ConsensusDeterminationModel { + /** + * Uses only indels from a provided ROD of known indels. + */ + KNOWNS_ONLY, + /** + * Additionally uses indels already present in the original alignments of the reads. + */ + USE_READS, + /** + * Additionally uses 'Smith-Waterman' to generate alternate consenses. + */ + USE_SW + } + + /** + * Any number of VCF files representing known indels to be used for constructing alternate consenses. + * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. + */ + @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) + public List> known = Collections.emptyList(); + + /** + * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). + */ + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) + protected IntervalBinding intervalsFile = null; + + /** + * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number + * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, + * this number should be smaller. 
+ */ + @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) + protected double LOD_THRESHOLD = 5.0; + + /** + * The realigned bam file. + */ + @Output(required=false, doc="Output bam", defaultToStdout=false) + protected StingSAMFileWriter writer = null; + protected ConstrainedMateFixingManager manager = null; + protected SAMFileWriter writerToUse = null; + + /** + * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; + * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). + */ + @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; + + + // ADVANCED OPTIONS FOLLOW + + /** + * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner + * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply + * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. + */ + @Advanced + @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + /** + * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; + * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. 
+ */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) + protected int MAX_RECORDS_IN_MEMORY = 150000; + + /** + * For expert users only! + */ + @Advanced + @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) + protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + + /** + * For expert users only! + */ + @Advanced + @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) + protected int MAX_POS_MOVE_ALLOWED = 200; + + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) + protected int MAX_CONSENSUSES = 30; + + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) + protected int MAX_READS_FOR_CONSENSUSES = 120; + + /** + * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. + * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. 
+ */ + @Advanced + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) + protected int MAX_READS = 20000; + + @Advanced + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") + protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; + + /** + * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that + * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), + * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the + * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output + * file name (2nd column) for each input file name (1st column). + * + * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). + */ + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") + protected String N_WAY_OUT = null; + + @Hidden + @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") + protected boolean generateMD5s = false; + + // DEBUGGING OPTIONS FOLLOW + + @Hidden + @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") + protected boolean CHECKEARLY = false; + + @Hidden + @Argument(fullName="noPGTag", shortName="noPG", required=false, + doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
This option is required in order to pass integration tests.") + protected boolean NO_PG_TAG = false; + + @Hidden + @Argument(fullName="keepPGTags", shortName="keepPG", required=false, + doc="Keep older PG tags left in the bam header by previous runs of this tool (by default, all these "+ + "historical tags will be replaced by the latest tag generated in the current run).") + protected boolean KEEP_ALL_PG_RECORDS = false; + + @Hidden + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + protected String OUT_INDELS = null; + + @Hidden + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) + protected String OUT_STATS = null; + + @Hidden + @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) + protected String OUT_SNPS = null; + + // fasta reference reader to supplement the edges of the reference sequence + private CachingIndexedFastaSequenceFile referenceReader; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + private boolean sawReadInCurrentInterval = false; + + // the reads and known indels that fall into the current interval + private ReadBin readsToClean; + private final ArrayList readsNotToClean = new ArrayList(); + private final ArrayList knownIndelsToTry = new ArrayList(); + private final HashSet indelRodsSeen = new HashSet(); + private final HashSet readsActuallyCleaned = new HashSet(); + + private static final int MAX_QUAL = 99; + + // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned + private 
static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; + + private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); + + // reference base padding size + // TODO -- make this a command-line argument if the need arises + private static final int REFERENCE_PADDING = 30; + + // other output files + private FileWriter indelOutput = null; + private FileWriter statsOutput = null; + private FileWriter snpsOutput = null; + + //###protected Map nwayWriters = null; + + + // debug info for lazy SW evaluation: + private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had + private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment + private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) + + private Map loadFileNameMap(String mapFile) { + Map fname_map = new HashMap(); + + try { + + XReadLines reader = new XReadLines(new File(mapFile),true); + for ( String line : reader ) { + if ( line.length() == 0 ) continue; + + String fields[] = line.split("\t"); + + if ( fields.length != 2 ) + throw new UserException.BadInput("Input-output map file must have exactly two columns. Offending line:\n"+line); + if ( fields[0].length() == 0 || fields[1].length() == 0 ) + throw new UserException.BadInput("Input-output map file can not have empty strings in either column. 
Offending line:\n"+line); + + if ( fname_map.containsKey(fields[0]) ) + throw new UserException.BadInput("Input-output map file contains duplicate entries for input name "+fields[0]); + if ( fname_map.containsValue(fields[1]) ) + throw new UserException.BadInput("Input-output map file maps multiple entries onto single output name "+fields[1]); + + fname_map.put(fields[0],fields[1]); + } + } catch (IOException e) { + throw new StingException("I/O Error while reading input-output map file "+N_WAY_OUT+": "+e.getMessage()); + } + return fname_map; + } + + public void initialize() { + readsToClean = new ReadBin(getToolkit().getGenomeLocParser(), REFERENCE_PADDING); + + if ( N_WAY_OUT == null && writer == null ) { + throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); + } + if ( N_WAY_OUT != null && writer != null ) { + throw new UserException.CommandLineException("-o and -nWayOut can not be used simultaneously"); + } + if ( LOD_THRESHOLD < 0.0 ) + throw new RuntimeException("LOD threshold cannot be a negative number"); + if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) + throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); + + try { + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); + } + + intervals = intervalsFile.getIntervals(getToolkit()).iterator(); + + currentInterval = intervals.hasNext() ? 
intervals.next() : null; + + if ( N_WAY_OUT != null ) { + boolean createIndex = true; + + if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { + writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT), + SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); + } else { + writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, + createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); + } + } else { + // set up the output writer + setupWriter(getToolkit().getSAMFileHeader()); + writerToUse = writer; + } + manager = new ConstrainedMateFixingManager(writerToUse, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); + + if ( OUT_INDELS != null ) { + try { + indelOutput = new FileWriter(new File(OUT_INDELS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed"); + logger.error(e.getMessage()); + indelOutput = null; + } + } + if ( OUT_STATS != null ) { + try { + statsOutput = new FileWriter(new File(OUT_STATS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed"); + logger.error(e.getMessage()); + statsOutput = null; + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput = new FileWriter(new File(OUT_SNPS)); + } catch (Exception e) { + logger.error("Failed to create output file "+ OUT_SNPS+". 
Cleaning snps output will be suppressed"); + logger.error(e.getMessage()); + snpsOutput = null; + } + } + } + + private void setupWriter(SAMFileHeader header) { + + if ( !NO_PG_TAG ) { + final SAMProgramRecord programRecord = createProgramRecord(); + + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } + + writer.writeHeader(header); + writer.setPresorted(true); + } + + + private SAMProgramRecord createProgramRecord() { + if ( NO_PG_TAG ) return null; + + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + try { + final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) { + // this is left empty on purpose (perhaps Andrey knows why?) 
+ } + programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + return programRecord; + } + + private void emit(final GATKSAMRecord read) { + + // check to see whether the read was actually cleaned, i.e. whether it is present in the readsActuallyCleaned set + boolean wasModified = readsActuallyCleaned.contains(read); + + try { + manager.addRead(read, wasModified); + } catch (RuntimeIOException e) { + throw new UserException.ErrorWritingBamFile(e.getMessage()); + } + } + + private void emitReadLists() { + // pre-merge lists to sort them in preparation for constrained SAMFileWriter + readsNotToClean.addAll(readsToClean.getReads()); + ReadUtils.sortReadsByCoordinate(readsNotToClean); + manager.addReads(readsNotToClean, readsActuallyCleaned); + readsToClean.clear(); + readsNotToClean.clear(); + readsActuallyCleaned.clear(); + } + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + emit(read); + return 0; + } + + // edge case: when the last target interval abuts the end of the genome, we'll get one of the + // unmapped reads while the currentInterval still isn't null. We need to trigger the cleaning + // at this point without trying to create a GenomeLoc. 
+ if ( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) { + cleanAndCallMap(ref, read, metaDataTracker, null); + return 0; + } + + GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) ) { + if ( !sawReadInCurrentInterval ) + emit(read); + else + readsNotToClean.add(read); + } + else if ( readLoc.overlapsP(currentInterval) ) { + sawReadInCurrentInterval = true; + + if ( doNotTryToClean(read) ) { + readsNotToClean.add(read); + } else { + readsToClean.add(read); + + // add the rods to the list of known variants + populateKnownIndels(metaDataTracker); + } + + if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { + logger.info("Not attempting realignment in interval " + currentInterval + " because there are too many reads."); + abortCleanForCurrentInterval(); + } + } + else { // the read is past the current interval + logger.debug(currentInterval.toString() + "\t" + read.getAlignmentStart() ); + cleanAndCallMap(ref, read, metaDataTracker, readLoc); + } + + return 0; + } + + private void abortCleanForCurrentInterval() { + emitReadLists(); + currentInterval = intervals.hasNext() ? 
intervals.next() : null; + sawReadInCurrentInterval = false; + } + + private boolean doNotTryToClean(GATKSAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT) || + ReadUtils.is454Read(read) || + ReadUtils.isIonRead(read); + // TODO -- it would be nice if we could use indels from 454/Ion reads as alternate consenses + } + + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); + } + knownIndelsToTry.clear(); + indelRodsSeen.clear(); + + emitReadLists(); + try { + do { + currentInterval = intervals.hasNext() ? intervals.next() : null; + + } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); + } catch (ReviewedStingException e) { + throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. 
***", e); + } + sawReadInCurrentInterval = false; + + // call back into map now that the state has been updated + map(ref, read, metaDataTracker); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); + emitReadLists(); + } else if ( readsNotToClean.size() > 0 ) { + emitReadLists(); + } + + knownIndelsToTry.clear(); + indelRodsSeen.clear(); + + if ( OUT_INDELS != null ) { + try { + indelOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_STATS != null ) { + try { + statsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_SNPS+" gracefully. 
Data may be corrupt."); + } + } + + manager.close(); + if ( N_WAY_OUT != null ) writerToUse.close(); + + if ( CHECKEARLY ) { + logger.info("SW alignments runs: "+SWalignmentRuns); + logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); + logger.info("SW alignments skipped (perfect match): "+exactMatchesFound); + logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+ + " ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)"); + } + } + + private void populateKnownIndels(RefMetaDataTracker metaDataTracker) { + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( indelRodsSeen.contains(vc) ) + continue; + indelRodsSeen.add(vc); + knownIndelsToTry.add(vc); + } + } + + private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) { + final byte[] readSeq = aRead.getReadBases(); + final byte[] quals = aRead.getBaseQualities(); + int sum = 0; + for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { + if ( refIndex >= refSeq.length ) { + sum += MAX_QUAL; + // optimization: once we pass the threshold, stop calculating + if ( sum > quitAboveThisValue ) + return sum; + } else { + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIndex]; + if ( !BaseUtils.isRegularBase(readChr) || !BaseUtils.isRegularBase(refChr) ) + continue; // do not count Ns/Xs/etc ? 
+ if ( readChr != refChr ) { + sum += (int)quals[readIndex]; + // optimization: once we pass the threshold, stop calculating + if ( sum > quitAboveThisValue ) + return sum; + } + } + } + return sum; + } + + private void clean(ReadBin readsToClean) { + + final List reads = readsToClean.getReads(); + if ( reads.size() == 0 ) + return; + + byte[] reference = readsToClean.getReference(referenceReader); + int leftmostIndex = readsToClean.getLocation().getStart(); + + final ArrayList refReads = new ArrayList(); // reads that perfectly match ref + final ArrayList altReads = new ArrayList(); // reads that don't perfectly match + final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? + final Set altConsenses = new LinkedHashSet(); // list of alt consenses + + // if there are any known indels for this region, get them and create alternate consenses + generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference); + + // decide which reads potentially need to be cleaned; + // if there are reads with a single indel in them, add that indel to the list of alternate consenses + long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); + + // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference, using totalRawMismatchSum as the random seed + if ( consensusModel == ConsensusDeterminationModel.USE_SW ) + generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); + + // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); + + Consensus bestConsensus = null; + + for (Consensus consensus : altConsenses) { + //logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str)); + +// if ( DEBUG ) { +// System.out.println("Checking consensus with alignment at "+consensus.positionOnReference+" cigar 
"+consensus.cigar); +// System.out.println(new String(consensus.str)); +// int z = 0; +// for ( ; z < consensus.positionOnReference; z++ ) System.out.print('.'); +// for ( z=0 ; z < consensus.cigar.getCigarElement(0).getLength() ; z++ ) System.out.print('.'); +// if ( consensus.cigar.getCigarElement(1).getOperator() == CigarOperator.I ) for ( z= 0; z < consensus.cigar.getCigarElement(1).getLength(); z++ ) System.out.print('I'); +// System.out.println(); +// } + + // if ( debugOn ) System.out.println("Consensus: "+consensus.str); + + for (int j = 0; j < altReads.size(); j++) { + AlignedRead toTest = altReads.get(j); + Pair altAlignment = findBestOffset(consensus.str, toTest, leftmostIndex); + + // the mismatch score is the min of its alignment vs. the reference and vs. the alternate + int myScore = altAlignment.second; + + if (myScore > toTest.getAlignerMismatchScore() || myScore >= toTest.getMismatchScoreToReference()) + myScore = toTest.getMismatchScoreToReference(); + // keep track of reads that align better to the alternate consensus. + // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het) + else + consensus.readIndexes.add(new Pair(j, altAlignment.first)); + + //logger.debug(consensus.cigar + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + toTest.getMismatchScoreToReference()); + if (!toTest.getRead().getDuplicateReadFlag()) + consensus.mismatchSum += myScore; + + // optimization: once the mismatch sum is higher than the best consensus, quit since this one can't win + // THIS MUST BE DISABLED IF WE DECIDE TO ALLOW MORE THAN ONE ALTERNATE CONSENSUS! 
+ if (bestConsensus != null && consensus.mismatchSum > bestConsensus.mismatchSum) + break; + } + + //logger.debug("Mismatch sum of new consensus: " + consensus.mismatchSum); + if (bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { + // we do not need this alt consensus, release memory right away!! + if (bestConsensus != null) + bestConsensus.readIndexes.clear(); + bestConsensus = consensus; + //logger.debug("New consensus " + bestConsensus.cigar + " is now best consensus"); + } else { + // we do not need this alt consensus, release memory right away!! + consensus.readIndexes.clear(); + } + } + + // if: + // 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads, + // 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads, + // 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy), + // then clean! + final double improvement = (bestConsensus == null ? 
-1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); + if ( improvement >= LOD_THRESHOLD ) { + + bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference, true); + + // start cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + AlignedRead aRead = altReads.get(indexPair.first); + if ( !updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, leftmostIndex) ) + return; + } + if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { + if ( statsOutput != null ) { + try { + statsOutput.write(currentInterval.toString()); + statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + } else { + //logger.debug("CLEAN: " + bestConsensus.cigar + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() ); + if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { + // NOTE: indels are printed out in the format specified for the low-coverage pilot1 + // indel calls (tab-delimited): chr position size type sequence + StringBuilder str = new StringBuilder(); + str.append(reads.get(0).getReferenceName()); + int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); + str.append("\t").append(leftmostIndex + position - 1); + CigarElement ce = bestConsensus.cigar.getCigarElement(1); + str.append("\t").append(ce.getLength()).append("\t").append(ce.getOperator()).append("\t"); + int length = ce.getLength(); + if ( 
ce.getOperator() == CigarOperator.D ) { + for ( int i = 0; i < length; i++) + str.append((char)reference[position+i]); + } else { + for ( int i = 0; i < length; i++) + str.append((char)bestConsensus.str[position+i]); + } + str.append("\t").append((((double) (totalRawMismatchSum - bestConsensus.mismatchSum)) / 10.0)).append("\n"); + try { + indelOutput.write(str.toString()); + indelOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("indelOutput", "Failed to write indel output file", e); + } + } + if ( statsOutput != null ) { + try { + statsOutput.write(currentInterval.toString()); + statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced + if ( bestConsensus.cigar.numCigarElements() > 1 ) + statsOutput.write(" (found indel)"); + statsOutput.write("\t"); + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + + // finish cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + final AlignedRead aRead = altReads.get(indexPair.first); + if ( aRead.finalizeUpdate() ) { + // We need to update the mapping quality score of the cleaned reads; + // however we don't have enough info to use the proper MAQ scoring system. + // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010]. 
+ // TODO -- we need a better solution here + GATKSAMRecord read = aRead.getRead(); + if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it + read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); + + // before we fix the attribute tags we first need to make sure we have enough of the reference sequence + int neededBasesToLeft = leftmostIndex - read.getAlignmentStart(); + int neededBasesToRight = read.getAlignmentEnd() - leftmostIndex - reference.length + 1; + int neededBases = Math.max(neededBasesToLeft, neededBasesToRight); + if ( neededBases > 0 ) { + int padLeft = Math.max(leftmostIndex-neededBases, 1); + int padRight = Math.min(leftmostIndex+reference.length+neededBases, referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength()); + reference = referenceReader.getSubsequenceAt(currentInterval.getContig(), padLeft, padRight).getBases(); + leftmostIndex = padLeft; + } + + // now, fix the attribute tags + // TODO -- get rid of this try block when Picard does the right thing for reads aligned off the end of the reference + try { + if ( read.getAttribute(SAMTag.NM.name()) != null ) + read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); + if ( read.getAttribute(SAMTag.UQ.name()) != null ) + read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); + } catch (Exception e) { + // ignore it + } + // TODO -- this is only temporary until Tim adds code to recalculate this value + if ( read.getAttribute(SAMTag.MD.name()) != null ) + read.setAttribute(SAMTag.MD.name(), null); + + // mark that it was actually cleaned + readsActuallyCleaned.add(read); + } + } + } + + // END IF ( improvement >= LOD_THRESHOLD ) + + } else if ( statsOutput != null ) { + try { + statsOutput.write(String.format("%s\tFAIL\t%.1f%n", + currentInterval.toString(), improvement)); + statsOutput.flush(); + } 
catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); + } + } + } + + private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { + for ( VariantContext knownIndel : knownIndelsToTry ) { + if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) + continue; + final byte[] indelStr; + if ( knownIndel.isSimpleInsertion() ) { + final byte[] fullAllele = knownIndel.getAlternateAllele(0).getBases(); + indelStr = Arrays.copyOfRange(fullAllele, 1, fullAllele.length); // remove ref padding + } else { + indelStr = Utils.dupBytes((byte)'-', knownIndel.getReference().length() - 1); + } + int start = knownIndel.getStart() - leftmostIndex + 1; + Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); + if ( c != null ) + altConsensesToPopulate.add(c); + } + } + + private long determineReadsThatNeedCleaning(final List reads, + final ArrayList refReadsToPopulate, + final ArrayList altReadsToPopulate, + final LinkedList altAlignmentsToTest, + final Set altConsenses, + final int leftmostIndex, + final byte[] reference) { + + long totalRawMismatchSum = 0L; + for ( final GATKSAMRecord read : reads ) { + + // we can not deal with screwy records + if ( read.getCigar().numCigarElements() == 0 ) { + refReadsToPopulate.add(read); + continue; + } + + final AlignedRead aRead = new AlignedRead(read); + + // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence + int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); + if ( numBlocks == 2 ) { + Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0, true); + aRead.setCigar(newCigar, false); + } + + final int startOnRef = read.getAlignmentStart()-leftmostIndex; + final int rawMismatchScore = 
mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); + + // if this doesn't match perfectly to the reference, let's try to clean it + if ( rawMismatchScore > 0 ) { + altReadsToPopulate.add(aRead); + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); + + if ( !read.getDuplicateReadFlag() ) + totalRawMismatchSum += rawMismatchScore; + aRead.setMismatchScoreToReference(rawMismatchScore); + aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); + + // if it has an indel, let's see if that's the best consensus + if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && numBlocks == 2 ) { + Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); + if ( c != null ) + altConsenses.add(c); + } else { + altAlignmentsToTest.add(aRead); + } + } + // otherwise, we can emit it as is + else { + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); + refReadsToPopulate.add(read); + } + } + + return totalRawMismatchSum; + } + + private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, + final Set altConsensesToPopulate, + final byte[] reference, + final int leftmostIndex) { + + // if we are under the limit, use all reads to generate alternate consenses + if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { + for ( AlignedRead aRead : altAlignmentsToTest ) { + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + } + // otherwise, choose reads for alternate consenses randomly + else { + int readsSeen = 0; + while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { + int index = 
GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); + AlignedRead aRead = altAlignmentsToTest.remove(index); + if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); + else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + } + } + + private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { + + // do a pairwise alignment against the reference + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); + if ( c != null ) + altConsensesToPopulate.add(c); + } + + private void createAndAddAlternateConsensus1(AlignedRead read, final Set altConsensesToPopulate, + final byte[] reference, final int leftmostIndex) { + + for ( Consensus known : altConsensesToPopulate ) { + Pair altAlignment = findBestOffset(known.str, read, leftmostIndex); + // the mismatch score is the min of its alignment vs. the reference and vs. 
the alternate + int myScore = altAlignment.second; + if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer + } + // do a pairwise alignment against the reference + SWalignmentRuns++; + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); + if ( c != null ) { + altConsensesToPopulate.add(c); + SWalignmentSuccess++; + } + } + + // create a Consensus from cigar/read strings which originate somewhere on the reference + private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { + if ( indexOnRef < 0 ) + return null; + + // if there are no indels, we do not need this consensus, can abort early: + if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null; + + // create the new consensus + ArrayList elements = new ArrayList(c.numCigarElements()-1); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < indexOnRef; i++) + sb.append((char)reference[i]); + + int indelCount = 0; + int altIdx = 0; + int refIdx = indexOnRef; + boolean ok_flag = true; + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case D: + refIdx += elementLength; + indelCount++; + elements.add(ce); + break; + case M: + case EQ: + case X: + altIdx += elementLength; + case N: + if ( reference.length < refIdx + elementLength ) + ok_flag = false; + else { + for (int j = 0; j < elementLength; j++) + sb.append((char)reference[refIdx+j]); + } + refIdx += elementLength; + elements.add(new CigarElement(elementLength, CigarOperator.M)); + break; + case I: + for (int j = 0; j < elementLength; j++) { + if 
( ! BaseUtils.isRegularBase(readStr[altIdx+j]) ) { + // Insertions with N's in them cause real problems sometimes; it's better to drop them altogether + ok_flag=false; + break; + } + sb.append((char)readStr[altIdx + j]); + } + altIdx += elementLength; + indelCount++; + elements.add(ce); + break; + case S: + default: + break; + } + } + // make sure that there is at most only a single indel and it aligns appropriately! + if ( !ok_flag || indelCount != 1 || reference.length < refIdx ) + return null; + + for (int i = refIdx; i < reference.length; i++) + sb.append((char)reference[i]); + byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read + + return new Consensus(altConsensus, new Cigar(elements), indexOnRef); + } + + // create a Consensus from just the indel string that falls on the reference + private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { + if ( indexOnRef < 0 || indexOnRef >= reference.length ) + return null; + + // create the new consensus + StringBuilder sb = new StringBuilder(); + Cigar cigar = new Cigar(); + int refIdx; + + for (refIdx = 0; refIdx < indexOnRef; refIdx++) + sb.append((char)reference[refIdx]); + if ( indexOnRef > 0 ) + cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); + + if ( indel.isSimpleDeletion() ) { + refIdx += indelStr.length; + cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); + } + else if ( indel.isSimpleInsertion() ) { + for ( byte b : indelStr ) + sb.append((char)b); + cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); + } else { + throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); + } + + if ( reference.length - refIdx > 0 ) + cigar.add(new CigarElement(reference.length - refIdx, CigarOperator.M)); + for (; refIdx < reference.length; refIdx++) + 
sb.append((char)reference[refIdx]); + byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read + + return new Consensus(altConsensus, cigar, 0); + } + + private Pair findBestOffset(final byte[] ref, final AlignedRead read, final int leftmostIndex) { + + // optimization: try the most likely alignment first (to get a low score to beat) + int originalAlignment = read.getOriginalAlignmentStart() - leftmostIndex; + int bestScore = mismatchQualitySumIgnoreCigar(read, ref, originalAlignment, Integer.MAX_VALUE); + int bestIndex = originalAlignment; + + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + + // optimization: the correct alignment shouldn't be too far from the original one (or else the read wouldn't have aligned in the first place) + for ( int i = 0; i < originalAlignment; i++ ) { + int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); + if ( score < bestScore ) { + bestScore = score; + bestIndex = i; + } + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + } + + final int maxPossibleStart = ref.length - read.getReadLength(); + for ( int i = originalAlignment + 1; i <= maxPossibleStart; i++ ) { + int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); + if ( score < bestScore ) { + bestScore = score; + bestIndex = i; + } + // optimization: we can't get better than 0, so we can quit now + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + } + + return new Pair(bestIndex, bestScore); + } + + + private boolean updateRead(final Cigar altCigar, final int altPosOnRef, final int myPosOnAlt, final AlignedRead aRead, final int leftmostIndex) { + Cigar readCigar = new Cigar(); + + // special case: there is no indel + if ( altCigar.getCigarElements().size() == 1 ) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + 
readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + aRead.setCigar(readCigar); + return true; + } + + CigarElement altCE1 = altCigar.getCigarElement(0); + CigarElement altCE2 = altCigar.getCigarElement(1); + + int leadingMatchingBlockLength = 0; // length of the leading M element or 0 if the leading element is I + + CigarElement indelCE; + if ( altCE1.getOperator() == CigarOperator.I ) { + indelCE=altCE1; + if ( altCE2.getOperator() != CigarOperator.M ) { + logger.warn("When the first element of the alt consensus is I, the second one must be M. Actual: " + altCigar.toString() + ". Skipping this site..."); + return false; + } + } + else { + if ( altCE1.getOperator() != CigarOperator.M ) { + logger.warn("First element of the alt consensus cigar must be M or I. Actual: " + altCigar.toString() + ". Skipping this site..."); + return false; + } + if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D ) { + indelCE=altCE2; + } else { + logger.warn("When first element of the alt consensus is M, the second one must be I or D. Actual: " + altCigar.toString() + ". 
Skipping this site..."); + return false; + } + leadingMatchingBlockLength = altCE1.getLength(); + } + + // the easiest thing to do is to take each case separately + int endOfFirstBlock = altPosOnRef + leadingMatchingBlockLength; + boolean sawAlignmentStart = false; + + // for reads starting before the indel + if ( myPosOnAlt < endOfFirstBlock) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + sawAlignmentStart = true; + + // for reads ending before the indel + if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) { + //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + //aRead.setCigar(readCigar); + aRead.setCigar(null); // reset to original alignment + return true; + } + readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); + } + + // forward along the indel + //int indelOffsetOnRef = 0, indelOffsetOnRead = 0; + if ( indelCE.getOperator() == CigarOperator.I ) { + // for reads that end in an insertion + if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + indelCE.getLength() ) { + int partialInsertionLength = myPosOnAlt + aRead.getReadLength() - endOfFirstBlock; + // if we also started inside the insertion, then we need to modify the length + if ( !sawAlignmentStart ) + partialInsertionLength = aRead.getReadLength(); + readCigar.add(new CigarElement(partialInsertionLength, CigarOperator.I)); + aRead.setCigar(readCigar); + return true; + } + + // for reads that start in an insertion + if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + indelCE.getLength() ) { + aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); + readCigar.add(new CigarElement(indelCE.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); + //indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; + sawAlignmentStart = true; + } else if ( sawAlignmentStart ) { + readCigar.add(indelCE); + //indelOffsetOnRead = indelCE.getLength(); + } + } else if ( indelCE.getOperator() == CigarOperator.D ) { + if ( 
sawAlignmentStart ) + readCigar.add(indelCE); + //indelOffsetOnRef = indelCE.getLength(); + } + + // for reads that start after the indel + if ( !sawAlignmentStart ) { + //aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead); + //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + //aRead.setCigar(readCigar); + aRead.setCigar(null); // reset to original alignment + return true; + } + + int readRemaining = aRead.getReadBases().length; + for ( CigarElement ce : readCigar.getCigarElements() ) { + if ( ce.getOperator() != CigarOperator.D ) + readRemaining -= ce.getLength(); + } + if ( readRemaining > 0 ) + readCigar.add(new CigarElement(readRemaining, CigarOperator.M)); + aRead.setCigar(readCigar); + + return true; + } + + private boolean alternateReducesEntropy(final List reads, final byte[] reference, final int leftmostIndex) { + final int[] originalMismatchBases = new int[reference.length]; + final int[] cleanedMismatchBases = new int[reference.length]; + final int[] totalOriginalBases = new int[reference.length]; + final int[] totalCleanedBases = new int[reference.length]; + + // set to 1 to prevent dividing by zero + for ( int i=0; i < reference.length; i++ ) + originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 0; + + for (final AlignedRead read : reads) { + if (read.getRead().getAlignmentBlocks().size() > 1) + continue; + + int refIdx = read.getOriginalAlignmentStart() - leftmostIndex; + final byte[] readStr = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); + + for (int j = 0; j < readStr.length; j++, refIdx++) { + if (refIdx < 0 || refIdx >= reference.length) { + //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); + //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); + break; + } + 
totalOriginalBases[refIdx] += quals[j]; + if (readStr[j] != reference[refIdx]) + originalMismatchBases[refIdx] += quals[j]; + } + + // reset and now do the calculation based on the cleaning + refIdx = read.getAlignmentStart() - leftmostIndex; + int altIdx = 0; + Cigar c = read.getCigar(); + for (int j = 0; j < c.numCigarElements(); j++) { + CigarElement ce = c.getCigarElement(j); + int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for (int k = 0; k < elementLength; k++, refIdx++, altIdx++) { + if (refIdx >= reference.length) + break; + totalCleanedBases[refIdx] += quals[altIdx]; + if (readStr[altIdx] != reference[refIdx]) + cleanedMismatchBases[refIdx] += quals[altIdx]; + } + break; + case I: + altIdx += elementLength; + break; + case D: + refIdx += elementLength; + break; + case S: + default: + break; + } + } + } + + int originalMismatchColumns = 0, cleanedMismatchColumns = 0; + StringBuilder sb = new StringBuilder(); + for ( int i=0; i < reference.length; i++ ) { + if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) + continue; + boolean didMismatch = false, stillMismatches = false; + if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { + didMismatch = true; + originalMismatchColumns++; + if ( totalCleanedBases[i] > 0 && ((double)cleanedMismatchBases[i] / (double)totalCleanedBases[i]) > ((double)originalMismatchBases[i] / (double)totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { + stillMismatches = true; + cleanedMismatchColumns++; + } + } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { + cleanedMismatchColumns++; + } + if ( snpsOutput != null ) { + if ( didMismatch ) { + sb.append(reads.get(0).getRead().getReferenceName()).append(":").append(leftmostIndex + i); + if ( stillMismatches ) + sb.append(" SAME_SNP\n"); + else + sb.append(" NOT_SNP\n"); + } + } + } + + //logger.debug("Original mismatch columns = " + 
originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); + + final boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); + if ( reduces && snpsOutput != null ) { + try { + snpsOutput.write(sb.toString()); + snpsOutput.flush(); + } catch (Exception e) { + throw new UserException.CouldNotCreateOutputFile("snpsOutput", "Failed to write SNPs output file", e); + } + } + return reduces; + } + + protected static Cigar unclipCigar(Cigar cigar) { + ArrayList elements = new ArrayList(cigar.numCigarElements()); + for ( CigarElement ce : cigar.getCigarElements() ) { + if ( !isClipOperator(ce.getOperator()) ) + elements.add(ce); + } + return new Cigar(elements); + } + + private static boolean isClipOperator(CigarOperator op) { + return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P; + } + + protected static Cigar reclipCigar(Cigar cigar, SAMRecord read) { + ArrayList elements = new ArrayList(); + + int i = 0; + int n = read.getCigar().numCigarElements(); + while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + elements.add(read.getCigar().getCigarElement(i++)); + + elements.addAll(cigar.getCigarElements()); + + i++; + while ( i < n && !isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + i++; + + while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) + elements.add(read.getCigar().getCigarElement(i++)); + + return new Cigar(elements); + } + + private class AlignedRead { + private final GATKSAMRecord read; + private byte[] readBases = null; + private byte[] baseQuals = null; + private Cigar newCigar = null; + private int newStart = -1; + private int mismatchScoreToReference = 0; + private long alignerMismatchScore = 0; + + public AlignedRead(GATKSAMRecord read) { + this.read = read; + mismatchScoreToReference = 0; + } + + public GATKSAMRecord getRead() { + return read; + } + + public int getReadLength() { 
+ return readBases != null ? readBases.length : read.getReadLength(); + } + + public byte[] getReadBases() { + if ( readBases == null ) + getUnclippedBases(); + return readBases; + } + + public byte[] getBaseQualities() { + if ( baseQuals == null ) + getUnclippedBases(); + return baseQuals; + } + + // pull out the bases that aren't clipped out + private void getUnclippedBases() { + readBases = new byte[getReadLength()]; + baseQuals = new byte[getReadLength()]; + byte[] actualReadBases = read.getReadBases(); + byte[] actualBaseQuals = read.getBaseQualities(); + int fromIndex = 0, toIndex = 0; + + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + int elementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case S: + fromIndex += elementLength; + break; + case M: + case EQ: + case X: + case I: + System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength); + System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength); + fromIndex += elementLength; + toIndex += elementLength; + default: + break; + } + } + + // if we got clipped, trim the array + if ( fromIndex != toIndex ) { + byte[] trimmedRB = new byte[toIndex]; + byte[] trimmedBQ = new byte[toIndex]; + System.arraycopy(readBases, 0, trimmedRB, 0, toIndex); + System.arraycopy(baseQuals, 0, trimmedBQ, 0, toIndex); + readBases = trimmedRB; + baseQuals = trimmedBQ; + } + } + + public Cigar getCigar() { + return (newCigar != null ? newCigar : read.getCigar()); + } + + public void setCigar(Cigar cigar) { + setCigar(cigar, true); + } + + // tentatively sets the new Cigar, but it needs to be confirmed later + public void setCigar(Cigar cigar, boolean fixClippedCigar) { + if ( cigar == null ) { + newCigar = null; + return; + } + + if ( fixClippedCigar && getReadBases().length < read.getReadLength() ) + cigar = reclipCigar(cigar); + + // no change? + if ( read.getCigar().equals(cigar) ) { + newCigar = null; + return; + } + + // no indel? 
+ String str = cigar.toString(); + if ( !str.contains("D") && !str.contains("I") ) { + logger.debug("Modifying a read with no associated indel; although this is possible, it is highly unlikely. Perhaps this region should be double-checked: " + read.getReadName() + " near " + read.getReferenceName() + ":" + read.getAlignmentStart()); + // newCigar = null; + // return; + } + + newCigar = cigar; + } + + // pull out the bases that aren't clipped out + private Cigar reclipCigar(Cigar cigar) { + return IndelRealigner.reclipCigar(cigar, read); + } + + // tentatively sets the new start, but it needs to be confirmed later + public void setAlignmentStart(int start) { + newStart = start; + } + + public int getAlignmentStart() { + return (newStart != -1 ? newStart : read.getAlignmentStart()); + } + + public int getOriginalAlignmentStart() { + return read.getAlignmentStart(); + } + + // finalizes the changes made. + // returns true if this record actually changes, false otherwise + public boolean finalizeUpdate() { + // if we haven't made any changes, don't do anything + if ( newCigar == null ) + return false; + if ( newStart == -1 ) + newStart = read.getAlignmentStart(); + else if ( Math.abs(newStart - read.getAlignmentStart()) > MAX_POS_MOVE_ALLOWED ) { + logger.debug(String.format("Attempting to realign read %s at %d more than %d bases to %d.", read.getReadName(), read.getAlignmentStart(), MAX_POS_MOVE_ALLOWED, newStart)); + return false; + } + + // store the old CIGAR and start in case we need to back out + final Cigar oldCigar = read.getCigar(); + final int oldStart = read.getAlignmentStart(); + + // try updating the read with the new CIGAR and start + read.setCigar(newCigar); + read.setAlignmentStart(newStart); + + // back out if necessary + if ( realignmentProducesBadAlignment(read) ) { + read.setCigar(oldCigar); + read.setAlignmentStart(oldStart); + return false; + } + + // annotate the record with the original cigar and start (if it changed) + if ( 
!NO_ORIGINAL_ALIGNMENT_TAGS ) { + read.setAttribute(ORIGINAL_CIGAR_TAG, oldCigar.toString()); + if ( newStart != oldStart ) + read.setAttribute(ORIGINAL_POSITION_TAG, oldStart); + } + + return true; + } + + public void setMismatchScoreToReference(int score) { + mismatchScoreToReference = score; + } + + public int getMismatchScoreToReference() { + return mismatchScoreToReference; + } + + public void setAlignerMismatchScore(long score) { + alignerMismatchScore = score; + } + + public long getAlignerMismatchScore() { + return alignerMismatchScore; + } + } + + /** + * Determines whether the read aligns off the end of the contig + * + * @param read the read to check + * @return true if it aligns off the end + */ + private boolean realignmentProducesBadAlignment(final GATKSAMRecord read) { + final int contigLength = referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength(); + return realignmentProducesBadAlignment(read, contigLength); + } + + /** + * Determines whether the read aligns off the end of the contig. + * Pulled out to make it testable. 
+ * + * @param read the read to check + * @return true if it aligns off the end + */ + protected static boolean realignmentProducesBadAlignment(final GATKSAMRecord read, final int contigLength) { + return read.getAlignmentEnd() > contigLength; + } + + private static class Consensus { + public final byte[] str; + public final ArrayList> readIndexes; + public final int positionOnReference; + public int mismatchSum; + public Cigar cigar; + + public Consensus(byte[] str, Cigar cigar, int positionOnReference) { + this.str = str; + this.cigar = cigar; + this.positionOnReference = positionOnReference; + mismatchSum = 0; + readIndexes = new ArrayList>(); + } + + @Override + public boolean equals(Object o) { + return ( this == o || (o instanceof Consensus && Arrays.equals(this.str,(((Consensus)o).str)) ) ); + } + + public boolean equals(Consensus c) { + return ( this == c || Arrays.equals(this.str,c.str) ) ; + } + + @Override + public int hashCode() { + return Arrays.hashCode(this.str); + } + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java new file mode 100644 index 000000000..318779cd2 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -0,0 +1,535 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; +import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.Map; + + +public class PairHMMIndelErrorModel { + public static final int BASE_QUAL_THRESHOLD = 20; + + private boolean DEBUG = false; + + private static final int 
MAX_CACHED_QUAL = 127; + + private static final double baseMatchArray[]; + private static final double baseMismatchArray[]; + + private static final int START_HRUN_GAP_IDX = 4; + private static final int MAX_HRUN_GAP_IDX = 20; + + private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. + + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; + + private final PairHMM pairHMM; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + + static { + baseMatchArray = new double[MAX_CACHED_QUAL+1]; + baseMismatchArray = new double[MAX_CACHED_QUAL+1]; + for (int k=1; k <= MAX_CACHED_QUAL; k++) { + double baseProb = Math.pow(10, -k/10.); + + + baseMatchArray[k] = Math.log10(1-baseProb); + baseMismatchArray[k] = Math.log10(baseProb); + } + } + + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + this.DEBUG = deb; + + switch (hmmType) { + case EXACT: + pairHMM = new Log10PairHMM(true); + break; + case ORIGINAL: + pairHMM = new Log10PairHMM(false); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessPairHMM(); + break; + case ARRAY_LOGLESS: + pairHMM = new ArrayLoglessPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); + } + + // fill gap penalty table, affine naive model: + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + + for (int i = 0; i < START_HRUN_GAP_IDX; i++) { + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; + } + + double step = GAP_PENALTY_HRUN_STEP/10.0; + + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + + // all of the following is computed in QUal-space + for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; + + gcp -= step; + if(gcp < MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; + GAP_OPEN_PROB_TABLE[i] = gop; + GAP_CONT_PROB_TABLE[i] = gcp; + } + + } + + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { + // compute forward hrun length, example: + // AGGTGACCCCCCTGAGAG + // 001000012345000000 + hrunArray[0] = 0; + int[] hforward = new int[hrunArray.length]; + int[] hreverse = new int[hrunArray.length]; + + for (int i = 1; i < refBytes.length; i++) { + if (refBytes[i] == refBytes[i-1]) + hforward[i] = hforward[i-1]+1; + else + hforward[i] = 0; + } + + // do similar thing for reverse length, example: + // AGGTGACCCCCCTGAGAG + // 021000543210000000 + // and then accumulate with forward values. 
+ // Total: + // AGGTGACCCCCCTGAGAG + // 022000555555000000 + for (int i=refBytes.length-1; i > 0; i--) { + if (refBytes[i-1] == refBytes[i]) + hreverse[i-1] += hreverse[i]+1; + } + + for (int i = 1; i < refBytes.length; i++) + hrunArray[i] = hforward[i]+hreverse[i]; + } + + + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { + // fill based on lookup table + for (int i = 0; i < hrunProfile.length; i++) { + if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { + contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; + contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; + } + else { + contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]]; + contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]]; + } + } + } + + /** + * Trims the haplotypes in the given map to the provided start/stop. + * + * @param haplotypeMap the input map + * @param startLocationInRefForHaplotypes the start location of the trim + * @param stopLocationInRefForHaplotypes the stop location of the trim + * @param ref the reference context (used for debugging only, so can be null) + * @return a non-null mapping corresponding to the trimmed version of the original; + * some elements may be lost if trimming cannot be performed on them (e.g. 
they fall outside of the region to keep) + */ + protected static Map trimHaplotypes(final Map haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref) { + if ( haplotypeMap == null ) throw new IllegalArgumentException("The input allele to haplotype map cannot be null"); + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + if ( indStart >= indStop ) + continue; + + // commented out here because we need to make this method static for unit testing + //if (DEBUG) + // System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + // indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. 
Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + + + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, + final double downsamplingFraction) { + final int numHaplotypes = haplotypeMap.size(); + + final int readCounts[] = new int[pileup.getNumberOfElements()]; + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); + return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + + } + + /** + * Should we clip a downstream portion of a read because it spans off the end of a haplotype? + * + * @param read the read in question + * @param refWindowStop the end of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() - 1 > refWindowStop ); + } + + /** + * Should we clip an upstream portion of a read because it spans off the start of a haplotype? 
+ * + * @param read the read in question + * @param refWindowStart the start of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart ); + } + + @Ensures("result != null && result.length == pileup.getNumberOfElements()") + public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, + final int[] readCounts) { + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); + int readIdx=0; + for (PileupElement p: pileup) { + // > 1 when the read is a consensus read representing multiple independent observations + readCounts[readIdx] = p.getRepresentativeCount(); + + // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) + if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { + Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); + int j=0; + for (Allele a: haplotypeMap.keySet()) { + readLikelihoods[readIdx][j++] = el.get(a); + } + } + else { + // extra padding on candidate haplotypes to make sure reads are always strictly contained + // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly + // in case bases at edge of reads have lower quality. 
+ final int trailingBases = 3; + final int refWindowStart = ref.getWindow().getStart() + trailingBases; + final int refWindowStop = ref.getWindow().getStop() - trailingBases; + + if (DEBUG) { + System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } + + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + + // if the read extends beyond the downstream (right) end of the reference window, clip it + if ( mustClipDownstream(read, refWindowStop) ) + read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1); + + // if the read extends beyond the upstream (left) end of the reference window, clip it + if ( mustClipUpstream(read, refWindowStart) ) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart); + + if (read.isEmpty()) + continue; + + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + + // get bases of candidate haplotypes that overlap with reads + final long readStart = read.getSoftStart(); + final long readEnd = read.getSoftEnd(); + + // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match, + // but they're actually consistent with the insertion! + // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning. + // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read. 
+ final long eventStartPos = ref.getLocus().getStart(); + + // compute total number of clipped bases (soft or hard clipped) and only use them if necessary + final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength); + final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0; + final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ; + final byte [] unclippedReadBases = read.getReadBases(); + final byte [] unclippedReadQuals = read.getBaseQualities(); + + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + final int absEventLength = Math.abs(eventLength); + long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0); + long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength; + + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; + + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + } + else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { + startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; + } + + // candidate haplotype cannot go beyond reference context + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context + } + + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read length, but we want to compute likelihoods in the whole region that a read might overlap + } + + // ok, we now figured out the total number of clipped bases on both ends. 
+ // Figure out where we want to place the haplotype to score read against + + if (DEBUG) + System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); + + // LinkedHashMap readEl = new LinkedHashMap(); + + /** + * Check if we'll end up with an empty read once all clipping is done + */ + if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { + int j=0; + for (Allele a: haplotypeMap.keySet()) { + perReadAlleleLikelihoodMap.add(p,a,0.0); + readLikelihoods[readIdx][j++] = 0.0; + } + } + else { + final int endOfCopy = unclippedReadBases.length - numEndSoftClippedBases; + final byte[] readBases = Arrays.copyOfRange(unclippedReadBases, numStartSoftClippedBases, endOfCopy); + final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals, numStartSoftClippedBases, endOfCopy); + + int j=0; + + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; + + // get homopolymer length profile for current haplotype + final int[] hrunProfile = new int[readBases.length]; + getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + // get the base insertion and deletion qualities to use + final byte[] baseInsertionQualities, baseDeletionQualities; + if ( read.hasBaseIndelQualities() ) { + baseInsertionQualities = Arrays.copyOfRange(read.getBaseInsertionQualities(), numStartSoftClippedBases, endOfCopy); + baseDeletionQualities = Arrays.copyOfRange(read.getBaseDeletionQualities(), numStartSoftClippedBases, endOfCopy); + } else { + baseInsertionQualities = 
contextLogGapOpenProbabilities; + baseDeletionQualities = contextLogGapOpenProbabilities; + } + + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); + + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); + + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); + + // Get the likelihoods for our clipped read against each of our trimmed haplotypes. + final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); + + // Pack the original pileup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); + readLikelihoods[readIdx][j++] = readLikelihood; + } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); + } + } + readIdx++; + } + + if (DEBUG) { + System.out.println("\nLikelihood summary"); + for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { + System.out.format("Read Index: %d ",readIdx); + for (int i=0; i < readLikelihoods[readIdx].length; i++) + System.out.format("L%d: 
%f ",i,readLikelihoods[readIdx][i]); + System.out.println(); + } + + } + + return readLikelihoods; + } + + private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { + return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); + } + +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } + + private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; + + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix + for (int i=0; i < numHaplotypes; i++) { + for (int j=i; j < numHaplotypes; j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) + //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) + for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. 
+ if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) + continue; + final double li = readLikelihoods[readIdx][i]; + final double lj = readLikelihoods[readIdx][j]; + final int readCount = readCounts[readIdx]; + haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF); + } + } + } + + final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; + int k=0; + for (int j=0; j < numHaplotypes; j++) { + for (int i=0; i <= j; i++){ + genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; + } + } + + // renormalize so that max element is zero. + return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBin.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/AllelePair.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java new file mode 100644 index 000000000..b39aa1b42 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -0,0 +1,112 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +class Haplotype extends BaseArray implements Cloneable { + public Haplotype(byte[] bases) { + super(bases); + } + + private Haplotype(Byte[] bases) { + super(bases); + } + + public Haplotype(Haplotype other) { + super(other); + } + + public Haplotype(BaseArray baseArr) { + super(baseArr.bases); + + if (baseArr.getNonNullIndices().length != baseArr.bases.length) + throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!"); + } + + public void updateBase(int index, Byte base) { + if (base == null) { + throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); + } + super.updateBase(index, base); + } + + public Haplotype clone() { + try { + super.clone(); + } catch (CloneNotSupportedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return new Haplotype(this); + } + + // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. 
+ + public Haplotype subHaplotype(int fromIndex, int toIndex) { + return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); + } + + public Haplotype subHaplotype(Set inds) { + List basesList = new LinkedList(); + for (int i : inds) { + if (0 <= i && i < bases.length) + basesList.add(bases[i]); + } + + Byte[] newBases = new Byte[basesList.size()]; + int index = 0; + for (Byte b : basesList) + newBases[index++] = b; + + return new Haplotype(newBases); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java new file mode 100644 index 000000000..707bf2722 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -0,0 +1,998 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; + 
+import java.io.PrintStream; +import java.util.*; + +/** + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites where parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prior; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non-ambiguous. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +public class PhaseByTransmission extends RodWalker, HashMap> { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; + + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; + + @Argument(shortName = "fatherAlleleFirst",required = false,fullName = "FatherAlleleFirst", doc="Ouputs the father allele as the first allele in phased child genotype. i.e. father|mother rather than mother|father.") + private boolean fatherFAlleleFirst=false; + + @Output + protected VariantContextWriter vcfWriter = null; + + private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; + private final String SOURCE_NAME = "PhaseByTransmission"; + + public final double NO_TRANSMISSION_PROB = -1.0; + + private ArrayList trios = new ArrayList(); + + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; + + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; + + //Metrics counters hash keys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte 
NUM_GENOTYPES_MODIFIED = 11; + + //Random number generator + private Random rand = new Random(); + + private enum FamilyMember { + MOTHER, + FATHER, + CHILD + } + + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. + private class TrioPhase { + + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; + + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + + private ArrayList getAlleles(GenotypeType genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == GenotypeType.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == GenotypeType.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == GenotypeType.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else{ + return null; + } + return alleles; + } + + private boolean isPhasable(GenotypeType genotype){ + return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR; + } + + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){ + boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR; + trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase)); + } + + private Genotype makeGenotype(final GenotypeType type, boolean phase) { + return makeGenotype(getAlleles(type), phase); + } + + private Genotype makeGenotype(final List alleles, boolean phase) { + final 
GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles); + gb.phased(phase); + return gb.make(); + } + + //Find the phase for a parent/child pair + private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){ + + //Special case for Het/Het as it is ambiguous + if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){ + trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); + return; + } + + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the parent and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + if(parent.equals(FamilyMember.MOTHER)) + childPhasedAlleles.add(childAlleles.get(0)); + else + childPhasedAlleles.add(0,childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + if(parent.equals(FamilyMember.MOTHER)) + childPhasedAlleles.add(childAlleles.get(0)); + else + childPhasedAlleles.add(0,childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + 
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); + } + } + + //Phases a family by transmission + private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){ + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } + } + + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + 
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false)); + } + + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){ + + //Take care of cases where one or more family members are no call + if(!isPhasable(child)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else if(!isPhasable(mother)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(!isPhasable(father)){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(!isPhasable(father)){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them 
is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } + + //If child should phased genotype should be father first, then swap the alleles + if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){ + ArrayList childAlleles = new ArrayList(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles()); + childAlleles.add(childAlleles.remove(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true)); + } + + } + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB){ + double 
dphredScoreTransmission = QualityUtils.phredScaleLog10ErrorRate(Math.log10(1 - (transmissionProb))); + phredScoreTransmission = dphredScoreTransmission < Byte.MAX_VALUE ? (byte)dphredScoreTransmission : Byte.MAX_VALUE; + } + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getExtendedAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); + + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); + + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double log10Error; + if(genotype.getType() == phasedGenotype.getType()) + log10Error = genotype.getLog10PError(); + else + log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); + + return new GenotypeBuilder(genotype).alleles(phasedAlleles) + .log10PError(log10Error) + .attributes(genotypeAttributes) + .phased(phasedGenotype.isPhased()).make(); + } + + + } + + /** + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer + */ + public void initialize() { + ArrayList rodNames = new ArrayList(); + rodNames.add(variantCollection.variants.getName()); + Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + + //Get the trios from the families passed as ped + setTrios(vcfSamples); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no *non-skipped* trios found in PED file. 
Aborted."); + + + Set headerLines = new HashSet(); + headerLines.addAll(GATKVCFUtils.getHeaderFields(this.getToolkit())); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); + headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("CHROM\tPOS\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_AD\tMOTHER_PL\tFATHER_GT\tFATHER_DP\tFATHER_AD\tFATHER_PL\tCHILD_GT\tCHILD_DP\tCHILD_AD\tCHILD_PL"); + + } + + /** + * Select trios and parent/child pairs only + */ + private void setTrios(Set vcfSamples){ + + Map> families = this.getSampleDB().getFamilies(vcfSamples); + Set family; + ArrayList parents; + for(Map.Entry> familyEntry : families.entrySet()){ + family = familyEntry.getValue(); + + // Since getFamilies(vcfSamples) above still returns parents of samples in the VCF even if those parents are not in the VCF, need to subset down here: + Set familyMembersInVCF = new TreeSet(); + for(Sample familyMember : family){ + if (vcfSamples.contains(familyMember.getID())) { + familyMembersInVCF.add(familyMember); + } + } + family = familyMembersInVCF; + + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size())); + } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Child %s of family %s skipped as info is not provided as a complete trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. 
Child skipped.", familyMember.getID(), familyEntry.getKey())); + } + } + } + + } + + + + } + + //Create the transmission matrices + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(GenotypeType.class); + transmissionMatrix = new EnumMap>>(GenotypeType.class); + for(GenotypeType mother : GenotypeType.values()){ + mvCountMatrix.put(mother,new EnumMap>(GenotypeType.class)); + transmissionMatrix.put(mother,new EnumMap>(GenotypeType.class)); + for(GenotypeType father : GenotypeType.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); + for(GenotypeType child : GenotypeType.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } + + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. 
+ private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){ + + //Child is no call => No MV + if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE)) + parents.add(mother); + if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(GenotypeType parent : parents){ + if(parent == GenotypeType.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == GenotypeType.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == GenotypeType.HOM_VAR){ + parentsNumAltAlleles++; + } + } + + //Case Child is HomRef + if(child == GenotypeType.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == GenotypeType.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } + + //Case child is Het + if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; + + //MV + return 1; + } + + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. 
+ private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } + + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ + EnumMap likelihoods = new EnumMap(GenotypeType.class); + likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); + likelihoods.put(GenotypeType.HET,1.0/3.0); + likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } + + //Returns the GenotypeType; returns UNVAILABLE if given null + private GenotypeType getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return GenotypeType.UNAVAILABLE; + return genotype.getType(); + } + + + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. 
+ * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); + GenotypeType pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); + pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? 
GenotypeType.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + int configuration_index =0; + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + //Get the most likely combination + //Only check for most likely combination if at least a parent and the child have genotypes + if(child.isCalled() && parentsCalled > 0){ + int mvCount; + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? 
Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); + bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + 
bestSecondParentGenotype.add(pairSecondParentGenotype); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(cumulativeMVCount/3); + } + configurationLikelihood = 0; + } + } + } + + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + + //In case of multiple equally likely combinations, take a random one + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); + } + + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; + } + + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); + + } + + + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void 
updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } + } + + /** + * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype + * + * @param tracker the reference meta-data tracker + * @param ref the reference context + * @param context the alignment context + * @return null + */ + @Override + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + String mvfLine; + + if (tracker == null) + return metricsCounters; + + final 
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + if ( vc == null ) + return metricsCounters; + + if ( !vc.isBiallelic() ) { + vcfWriter.add(vc); + return metricsCounters; + } + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + final GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); + + Genotype phasedMother = trioGenotypes.get(0); + Genotype phasedFather = trioGenotypes.get(1); + Genotype phasedChild = trioGenotypes.get(2); + + //Fill the genotype map with the new genotypes and increment metrics counters + genotypesContext.replace(phasedChild); + if(mother != null){ + genotypesContext.replace(phasedMother); + if(father != null){ + genotypesContext.replace(phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()), + phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), + 
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + } + else{ + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + } + } + else{ + genotypesContext.replace(phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), + phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); + } + + //Report violation if set so + //TODO: ADAPT 
FOR PAIRS TOO!! + if(mvCount>0 && mvFile != null && !vc.isFiltered()) + mvFile.println(mvfLine); + } + + builder.genotypes(genotypesContext); + vcfWriter.add(builder.make()); + + return metricsCounters; + } + + private static String printAD(final int[] AD) { + if ( AD == null || AD.length == 0 ) + return "."; + final StringBuilder sb = new StringBuilder(); + sb.append(AD[0]); + for ( int i = 1; i < AD.length; i++) { + sb.append(","); + sb.append(AD[i]); + } + return sb.toString(); + } + + /** + * Initializes the reporting counters. + * + * @return All counters initialized to 0 + */ + @Override + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + return metricsCounters; + } + + /** + * Adds the value of the site phased to the reporting counters. + * + * @param value Site values + * @param sum accumulator for the reporting counters + * @return accumulator with result of the map taken into account. 
+ */ + @Override + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + + return sum; + } + + + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. 
+ */ + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java new file mode 100644 index 000000000..7ed77b845 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -0,0 +1,1870 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import 
org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.io.*; +import java.util.*; + +import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; + +/** + * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * + * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + * + * The underlying algorithm is based on building up 2^n local haplotypes, + * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). + * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + * + *

+ * Performs physical phasing of SNP calls, based on sequencing reads.
+ * <p/>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * VCF file of SNP calls, BAM file of sequence reads.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * Phased VCF file.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadBackedPhasing
+ *      -R reference.fasta
+ *      -I reads.bam
+ *      --variant SNPs.vcf
+ *      -L SNPs.vcf
+ *      -o phased_SNPs.vcf
+ *      --phaseQualityThresh 20.0
+ * 
+ * + * @author Menachem Fromer + * @since July 2010 + */ +@Allows(value = {DataSource.READS, DataSource.REFERENCE}) +@Requires(value = {DataSource.READS, DataSource.REFERENCE}) +@By(DataSource.READS) + +// Filter out all reads with zero mapping quality +@ReadFilters({MappingQualityZeroFilter.class}) + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +public class ReadBackedPhasing extends RodWalker { + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) + protected boolean DEBUG = false; + /** + * The VCF file we are phasing variants from. + * + * All heterozygous variants found in this VCF file will be phased, where possible + */ + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc = "File to which variants should be written") + protected VariantContextWriter writer = null; + + @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false) + protected Integer cacheWindow = 20000; + + @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false) + protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes + + @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false) + protected Double phaseQualityThresh = 20.0; // PQ = 20.0 <=> P(error) = 10^(-20/10) = 0.01, P(correct) = 0.99 + + @Hidden + @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING 
purposes only - DO NOT USE!]", required = false) + protected String variantStatsFilePrefix = null; + private PhasingQualityStatsWriter statsWriter = null; + + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false) + public int MIN_BASE_QUALITY_SCORE = 17; + + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false) + public int MIN_MAPPING_QUALITY_SCORE = 20; + + @Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false) + protected Set samplesToPhase = null; + + @Hidden + @Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false) + private boolean permitNoSampleOverlap = false; + + private GenomeLoc mostDownstreamLocusReached = null; + + private LinkedList unphasedSiteQueue = null; + private CloneableIteratorLinkedList partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions + + private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); + + public static final String PQ_KEY = "PQ"; + public static final String HP_KEY = "HP"; + + // In order to detect phase inconsistencies: + private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads + private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site + + public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent"; + + @Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", 
doc = "Merge consecutive phased sites into MNP records", required = false) + protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false; + + @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false) + protected int maxGenomicDistanceForMNP = 1; + + @Hidden + @Argument(fullName = "outputMultipleBaseCountsFile", shortName = "outputMultipleBaseCountsFile", doc = "File to output cases where a single read has multiple bases at the same position [For DEBUGGING purposes only - DO NOT USE!]", required = false) + protected File outputMultipleBaseCountsFile = null; + private MultipleBaseCountsWriter outputMultipleBaseCountsWriter = null; + + public void initialize() { + if (maxPhaseSites <= 2) + maxPhaseSites = 2; // by definition, must phase a site relative to previous site [thus, 2 in total] + + /* + Since we cap each base quality (BQ) by its read's mapping quality (MQ) [in Read.updateBaseAndQuality()], then: + if minBQ > minMQ, then we require that MQ be >= minBQ as well. + [Otherwise, we end up capping BQ by MQ only AFTER we tried removing bases with BQ < minBQ, which is WRONG!] 
+ + To do this properly, we set: minMQ = max(minMQ, minBQ) + */ + MIN_MAPPING_QUALITY_SCORE = Math.max(MIN_MAPPING_QUALITY_SCORE, MIN_BASE_QUALITY_SCORE); + + unphasedSiteQueue = new LinkedList(); + partiallyPhasedSites = new CloneableIteratorLinkedList(); + + initializeVcfWriter(); + + if (variantStatsFilePrefix != null) + statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix); + + if (outputMultipleBaseCountsFile != null) + outputMultipleBaseCountsWriter = new MultipleBaseCountsWriter(outputMultipleBaseCountsFile); + } + + private void initializeVcfWriter() { + // Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here] + VariantContextWriter origWriter = writer; + + if (enableMergePhasedSegregatingPolymorphismsToMNP) + writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter); + + /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow + Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow + Hence, the startDistance between: partiallyPhasedSites.peek() --> mostDownstreamLocusReached is <= 2 * cacheWindow + + Therefore, can write the filtered records located at mostDownstreamLocusReached (if any) to SortingVCFWriter, even though partiallyPhasedSites.peek() has not yet been written. + + But, NOTE that map() is careful to pass out a list of records to be written that FIRST includes any records discarded due to having reached mostDownstreamLocusReached, + and only THEN records located at mostDownstreamLocusReached. The opposite order in map() would violate the startDistance limits imposed when contracting SortingVCFWriter with (2 * cacheWindow). 
+ */ + writer = VariantContextWriterFactory.sortOnTheFly(writer, 2 * cacheWindow, writer != origWriter); + + // setup the header fields: + Set hInfo = new HashSet(); + hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + // Phasing-specific INFO fields: + hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + hInfo.add(new VCFFormatHeaderLine(HP_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Read-backed phasing haplotype identifiers")); + hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?")); + + // todo -- fix samplesToPhase + String trackName = variantCollection.variants.getName(); + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + Set vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); + writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); + + Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + readSamples.retainAll(vcfSamples); + if (readSamples.isEmpty()) { + String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? "" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; + if (permitNoSampleOverlap) + logger.warn(noPhaseString); + else + throw new UserException(noPhaseString); + } + } + + public PhasingStats reduceInit() { + return new PhasingStats(); + } + + /** + * For each site of interest, cache the current site and then use the cache to phase all sites + * for which "sufficient" information has already been observed. 
+ * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. + */ + public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + mostDownstreamLocusReached = ref.getLocus(); + if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached); + + PhasingStats phaseStats = new PhasingStats(); + List unprocessedList = new LinkedList(); + + for (VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { + if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase); + + if (ReadBackedPhasing.processVariantInPhasing(vc)) { + VariantAndReads vr = new VariantAndReads(vc, context); + unphasedSiteQueue.add(vr); + + if (DEBUG) + logger.debug("Added variant to queue = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); + } + else { + unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly + + if (DEBUG) + logger.debug("Unprocessed variant = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + } + + int numReads = context.getBasePileup().getNumberOfElements(); + PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); + phaseStats.addIn(addInPhaseStats); + } + + List completedList = processQueue(phaseStats, false); + completedList.addAll(unprocessedList); // add unprocessedList on to the END of completedList so that the processQueue() results, which are necessarily more upstream, are first! 
+ + return new PhasingStatsAndOutput(phaseStats, completedList); + } + + private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); + + private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { +// for ( String sample : samplesToPhase ) +// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); +// logger.debug("original VC = " + vc); +// logger.debug("sub VC = " + subvc); + return GATKVariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); + } + + // Phase all "waiting" genotypes in the unphasedSiteQueue, but only if we have sufficient downstream genotypes with which to phase them + private List processQueue(PhasingStats phaseStats, boolean processAll) { + List oldPhasedList = new LinkedList(); + + while (!unphasedSiteQueue.isEmpty()) { + if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue + VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; + if (startDistancesAreInWindowRange(mostDownstreamLocusReached, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) { + /* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc + (note that we ASSUME that the VCF is ordered by ). + Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. 
+ */ + break; + } + // Already saw all variant positions within cacheWindow startDistance ahead of vc (on its contig) + } + // Update partiallyPhasedSites before it's used in phaseSite: + oldPhasedList.addAll(discardIrrelevantPhasedSites()); + if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); + + VariantAndReads vr = unphasedSiteQueue.remove(); + if (DEBUG) + logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); + phaseSite(vr, phaseStats); + } + + // Update partiallyPhasedSites after phaseSite is done: + oldPhasedList.addAll(discardIrrelevantPhasedSites()); + if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); + + if (outputMultipleBaseCountsWriter != null) + outputMultipleBaseCountsWriter.outputMultipleBaseCounts(); + + return oldPhasedList; + } + + // Flush out sites with (possibly) phased genotypes, if those sites are no longer needed to phase other downstream sites + private List discardIrrelevantPhasedSites() { + List vcList = new LinkedList(); + + GenomeLoc nextToPhaseLoc = null; + if (!unphasedSiteQueue.isEmpty()) + nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); + + while (!partiallyPhasedSites.isEmpty()) { + if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" + UnfinishedVariantAndReads partPhasedVr = partiallyPhasedSites.peek(); + + if (startDistancesAreInWindowRange(partPhasedVr.unfinishedVariant.getLocation(), nextToPhaseLoc)) + // nextToPhaseLoc is still not far enough ahead of partPhasedVr to exclude partPhasedVr from calculations + break; + } + UnfinishedVariantAndReads uvr = partiallyPhasedSites.remove(); + vcList.add(uvr.unfinishedVariant.toVariantContext()); + } + + return vcList; + } + + /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in + 
partiallyPhasedSites, and all in unphasedSiteQueue that are within cacheWindow startDistance ahead of vc (on its contig). + + ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). + */ + + private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { + VariantContext vc = vr.variant; + logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + + UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr); + UnfinishedVariantContext uvc = uvr.unfinishedVariant; + + // Perform per-sample phasing: + GenotypesContext sampGenotypes = vc.getGenotypes(); + Map samplePhaseStats = new TreeMap(); + for (final Genotype gt : sampGenotypes) { + String samp = gt.getSampleName(); + + if (DEBUG) logger.debug("sample = " + samp); + if (isUnfilteredCalledDiploidGenotype(gt)) { + if (gt.isHet()) { // Attempt to phase this het genotype relative to *SOME* previous het genotype: + + // Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites): + List prevHetGenotypes = new LinkedList(); + CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); + while (phasedIt.hasNext()) { + UnfinishedVariantAndReads phasedVr = phasedIt.next(); + Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp); + if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation()); + prevHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); + } + } + + SNPallelePair allelePair = new SNPallelePair(gt); + if (DEBUG) logger.debug("Want to phase TOP vs. 
BOTTOM for: " + "\n" + allelePair); + + boolean phasedCurGenotypeRelativeToPrevious = false; + for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) { + PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets); + + PhaseResult pr = phaseSampleAtSite(phaseWindow); + phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality); + + if (pr.phasingContainsInconsistencies) { + if (DEBUG) + logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + uvc.setPhasingInconsistent(); + } + + if (phasedCurGenotypeRelativeToPrevious) { + Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype(); + SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype); + if (!prevHetGenotype.hasAnyAttribute(HP_KEY)) + throw new ReviewedStingException("Internal error: missing haplotype markings for previous genotype, even though we put it there..."); + String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(HP_KEY); + + String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype); + Genotype phasedGt = new GenotypeBuilder(gt) + .alleles(allelePair.getAllelesAsList()) + .attribute(PQ_KEY, pr.phaseQuality) + .attribute(HP_KEY, curPairNames) + .make(); + uvc.setGenotype(samp, phasedGt); + + if (DEBUG) { + logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]); + logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n"); + + logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]); + logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n"); + logger.debug("\n"); + } + } + + if (statsWriter != null) { + GenomeLoc prevLoc = null; + int curIndex = 0; + for (GenotypeAndReadBases grb : 
prevHetGenotypes) { + if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) { + prevLoc = grb.loc; + break; + } + ++curIndex; + } + statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); + } + + PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); + if (sampPhaseCounts == null) { + sampPhaseCounts = new PhaseCounts(); + samplePhaseStats.put(samp, sampPhaseCounts); + } + sampPhaseCounts.numTestedSites++; + + if (pr.phasingContainsInconsistencies) { + if (phasedCurGenotypeRelativeToPrevious) + sampPhaseCounts.numInconsistentSitesPhased++; + else + sampPhaseCounts.numInconsistentSitesNotPhased++; + } + + if (phasedCurGenotypeRelativeToPrevious) + sampPhaseCounts.numPhased++; + + // Phased current relative to *SOME* previous het genotype, so break out of loop: + if (phasedCurGenotypeRelativeToPrevious) + break; + } + + if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het: + String locStr = Integer.toString(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc).getStart()); + + Genotype startNewHaplotypeGt = new GenotypeBuilder(gt) + .attribute(HP_KEY, new String[]{locStr + "-1", locStr + "-2"}) + .make(); + + uvc.setGenotype(samp, startNewHaplotypeGt); + } + } + } + } + + partiallyPhasedSites.add(uvr); // only add it in now, since don't want it to be there during phasing + phaseStats.addIn(new PhasingStats(samplePhaseStats)); + } + + public boolean passesPhasingThreshold(double PQ) { + return PQ >= phaseQualityThresh; + } + + // A genotype and the base pileup that supports it + private static class GenotypeAndReadBases { + public Genotype genotype; + public ReadBasesAtPosition readBases; + public GenomeLoc loc; + + public GenotypeAndReadBases(Genotype genotype, ReadBasesAtPosition readBases, GenomeLoc 
loc) { + this.genotype = genotype; + this.readBases = readBases; + this.loc = loc; + } + } + + // Object to represent the local window of het genotypes for which haplotypes are being scored and ranked + private class PhasingWindow { + private Genotype[] hetGenotypes = null; + + private int phaseRelativeToIndex = -1; + private int phasingSiteIndex = -1; + + private Map readsAtHetSites = null; + + public Genotype phaseRelativeToGenotype() { + return hetGenotypes[phaseRelativeToIndex]; + } + + // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)] + + public PhasingWindow(VariantAndReads vr, String sample, List prevHetGenotypes, int goBackFromEndOfPrevHets) { + if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase + throw new ReviewedStingException("Should never get empty set of previous sites to phase against"); + + // Include these previously phased sites in the phasing computation: + List listHetGenotypes = new LinkedList(prevHetGenotypes); + + phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets; + phasingSiteIndex = listHetGenotypes.size(); + + // Add the (het) position to be phased [at phasingSiteIndex]: + GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); + GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); + listHetGenotypes.add(grbPhase); + if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); + + // Include as-of-yet unphased sites in the phasing computation: + for (VariantAndReads nextVr : unphasedSiteQueue) { + if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc + break; + Genotype gt = nextVr.variant.getGenotype(sample); + if (gt != null 
&& isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); + listHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); + } + } + + // First, assemble the "sub-reads" from the COMPLETE WINDOW-BASED SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes, sample, grbPhase.loc); + + // Remove extraneous reads (those that do not "connect" the two core phasing sites): + Set onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + + // Dynamically modify the window to only include sites which have a non-empty set of reads: + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + + // In any case, must still trim the window size to be "feasible" + // [**NOTE**: May want to do this to try maximize the preservation of paths from phaseRelativeToIndex to phasingSiteIndex]: + if (listHetGenotypes.size() > maxPhaseSites) { + listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); + + // Can now remove any extra reads (and then sites): + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + } + + // Lastly, assemble the "sub-reads" from the FINAL SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + + // Copy to a fixed-size array: + if (DEBUG) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); + hetGenotypes = new Genotype[listHetGenotypes.size()]; + int index = 0; + for (GenotypeAndReadBases copyGrb : listHetGenotypes) + hetGenotypes[index++] = copyGrb.genotype; + } + + // Build the read sub-sequences at the het genomic positions: + private void 
buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc) { + buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); + } + + private void buildReadsAtHetSites(List listHetGenotypes, Set onlyKeepReads) { + buildReadsAtHetSites(listHetGenotypes, null, null, onlyKeepReads); + } + + private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc, Set onlyKeepReads) { + readsAtHetSites = new HashMap(); + + int index = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + ReadBasesAtPosition readBases = grb.readBases; + if (readBases != null) { + for (ReadBase rb : readBases) { + String readName = rb.readName; + if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads + continue; + + PhasingRead rd = readsAtHetSites.get(readName); + if (rd == null) { + rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual); + readsAtHetSites.put(readName, rd); + } + else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index + && sample != null && phasingLoc != null) { + outputMultipleBaseCountsWriter.setMultipleBases(new SampleReadLocus(sample, readName, grb.loc), phasingLoc, rd.getBase(index), rb.base); + } + + // Arbitrarily updates to the last base observed for this sample and read (rb.base): + rd.updateBaseAndQuality(index, rb.base, rb.baseQual); + } + } + index++; + } + if (DEBUG) logger.debug("Number of sites in window = " + index); + + if (DEBUG && logger.isDebugEnabled()) { + logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:"); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } + } + } + + // Object to represent a pair of genomic sites, and all reads overlapping those 2 sites (though possibly others) + private class EdgeToReads { + 
private TreeMap> edgeReads; + + public EdgeToReads() { + this.edgeReads = new TreeMap>(); // implemented GraphEdge.compareTo() + } + + public void addRead(PhasingGraphEdge e, String readName) { + List reads = edgeReads.get(e); + if (reads == null) { + reads = new LinkedList(); + edgeReads.put(e, reads); + } + reads.add(readName); + } + + public List getReads(PhasingGraphEdge e) { + return edgeReads.get(e); + } + } + + private class IntegerSet implements Iterable { + private Set list; + + public IntegerSet(Set list) { + this.list = list; + } + + public boolean contains(int i) { + return list.contains(i); + } + + public Iterator iterator() { + return list.iterator(); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i : this) { + sb.append(i + ", "); + } + return sb.toString(); + } + } + + // Remove any reads that add no "connections" (PhasingGraphEdge) between pairs of het sites: + public Set removeExtraneousReads(int numHetSites) { + PhasingGraph readGraph = new PhasingGraph(numHetSites); + EdgeToReads edgeToReads = new EdgeToReads(); + Set sitesWithEdges = new TreeSet(); + + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + + int[] siteInds = rd.getNonNullIndices(); + // Connect each pair of non-null sites in rd: + for (int i = 0; i < siteInds.length; i++) { + for (int j = i + 1; j < siteInds.length; j++) { + PhasingGraphEdge e = new PhasingGraphEdge(siteInds[i], siteInds[j]); + if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e); + readGraph.addEdge(e); + + edgeToReads.addRead(e, rdName); + + sitesWithEdges.add(e.getV1()); + sitesWithEdges.add(e.getV2()); + } + } + } + if (DEBUG) logger.debug("Read graph:\n" + readGraph); + Set keepReads = new HashSet(); + + /* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex): + + In detail: + Every Read links EACH pair of sites for 
which it contains bases. Then, each such edge is added to a "site connectivity graph". + A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by + considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and + cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant. + By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example, + if a path exists from prev ---> v1 that excludes v2. + + Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well. + And, if the path uses edges from other reads, then keeping all other reads that contribute those edges + [which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist. + + NOTE: + If we would use NON-UNIFORM priors for the various haplotypes consistent with a margnialized haplotype, then this calculation would not be correct, since the equivalence of: + 1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values). + 2. The read has edges involved in a path from prev ---> cur. + DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior. + + This is due to the following: + [We denote: + R = set of all reads + r = a single read + "AA + CC" = AA on top chromosome, CC on bottom chromosome] + + Note that since there are only two haplotype possibilities: + P(AA + CC | R) + P(AC + CA | R) = 1 + + Now, if we assume that all haplotypes consistent with AA + CC have the same prior probability [P(AA + CC | R)], then: + P(AA + CC | R) + = P(AAAA + CCCC | R) + ... + P(AACC + CCAA | R) + = [P(AAAA + CCCC , R) + ... 
+ P(AACC + CCAA , R)] / P(R) + \propto P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R) + = P(R | AAAA + CCCC)*P(AAAA + CCCC) + ... + P(R | AACC + CCAA)*P(AACC + CCAA) + = P(AA + CC | R) * [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] + + Since we assume independence between reads given a particular haplotype [P(R | AAAA + CCCC) = \prod_r P(r | AAAA + CCCC)], + a new read r affects P(AA + CC | R) by multiplying each of the terms in the sum by, e.g., P(r | AAAA + CCCC). + Therefore, if these values do not affect the ratio of: + (I) [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] / [P(R | ACAA + CACC) + ... + P(R | ACCC + CAAA)] + then they do not affect the value of: + (II) P(AA + CC | R) / P(AC + CA | R) [which uniquely defines their values, since they sum to 1] + + And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R. + */ + int prev = phaseRelativeToIndex; + int cur = phasingSiteIndex; + + if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev + if (DEBUG) + logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); + readsAtHetSites.clear(); + return keepReads; + } + + /* Check the connected components of prev and cur when removing each individual vertex's edges: + [Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)] + */ + IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites]; + IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites]; + for (int i : sitesWithEdges) { + if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i); + + // Remove all edges incident to i and see which positions have paths to prev and cur: + Collection removedEdges = readGraph.removeAllIncidentEdges(i); + + // Run-time for efficiently calculating connected components using DisjointSet: O(E) 
+ DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); + removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges)); + removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges)); + + if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]); + if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]); + + // Add the removed edges back in: + readGraph.addEdges(removedEdges); + } + + for (PhasingGraphEdge e : readGraph) { + if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e); + + /* Edge e={v1,v2} contributes a path between prev and cur for testRead iff: + testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa]. + Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges, + and the path from v2 ---> cur will NOT contain v1. + */ + boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.getV1()].contains(e.getV2()) && removedSiteSameCCAsCur[e.getV2()].contains(e.getV1()); + boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.getV2()].contains(e.getV1()) && removedSiteSameCCAsCur[e.getV1()].contains(e.getV2()); + + if (prevTo2and1ToCur || prevTo1and2ToCur) { + for (String readName : edgeToReads.getReads(e)) { + keepReads.add(readName); + + if (DEBUG && logger.isDebugEnabled()) { + if (prevTo2and1ToCur) + logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV2() + " -> " + e.getV1() + " ---> " + cur); + else + logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV1() + " -> " + e.getV2() + " ---> " + cur); + } + } + } + } + + // Retain only the reads that contain an edge in a path connecting prev and cur: + Iterator> readIt = readsAtHetSites.entrySet().iterator(); + while (readIt.hasNext()) { + Map.Entry nameToReads = readIt.next(); + String rdName = 
nameToReads.getKey(); + if (!keepReads.contains(rdName)) { + readIt.remove(); + if (DEBUG) logger.debug("Removing extraneous read: " + rdName); + } + } + + return keepReads; + } + + // Remove all het sites that have no reads (which may occur if all of the reads supporting the original call don't contain an additional het site and were thus removed above): + private List removeExtraneousSites(List listHetGenotypes) { + Set sitesWithReads = new HashSet(); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); + for (int i : rd.getNonNullIndices()) + sitesWithReads.add(i); + } + + // Remove all sites that have no read bases: + List keepHetSites = new LinkedList(); + int index = 0; + int numPrecedingPhaseRelativeToSiteRemoved = 0; + int numPrecedingPhasingSiteRemoved = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + boolean keepSite = sitesWithReads.contains(index); + if (DEBUG && logger.isDebugEnabled() && !keepSite) + logger.debug("Removing read-less site " + grb.loc); + + if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) { + keepHetSites.add(grb); + if (!keepSite) + if (DEBUG) + logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); + } + else { + if (index <= phaseRelativeToIndex) + numPrecedingPhaseRelativeToSiteRemoved++; + if (index <= phasingSiteIndex) + numPrecedingPhasingSiteRemoved++; + } + + index++; + } + + phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved; + phasingSiteIndex -= numPrecedingPhasingSiteRemoved; + return keepHetSites; + } + + /* Auxilary object to sort candidate het sites with which to phase the index site, + where sorting is performed based on distance to the index site + (since presumably closer sites will have greater numbers of overlapping reads) + */ + private class SortSitesBySumOfDist implements Comparator { + private Vector grb; + + public 
SortSitesBySumOfDist(List listHetGenotypes) { + grb = new Vector(listHetGenotypes); + } + + public int compare(Integer i1, Integer i2) { + int d1 = calcGenomicDist(i1); + int d2 = calcGenomicDist(i2); + + if (d1 != d2) + return d1 - d2; + + int id1 = calcIndexDist(i1); + int id2 = calcIndexDist(i2); + if (id1 != id2) + return id1 - id2; + + return i1 - i2; + } + + private int calcGenomicDist(int i) { + int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc); + int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc); + + return d1 + d2; + } + + private int calcIndexDist(int i) { + int d1 = Math.abs(i - phaseRelativeToIndex); + int d2 = Math.abs(i - phasingSiteIndex); + + return d1 + d2; + } + } + + // Create a "phasing window" of het sites to use for phasing the index site, but limiting to only maxPhaseSites het sites to incorporate [as specified by the user] + private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { + if (DEBUG) + logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); + + Set scoreAllIndices = new TreeSet(new SortSitesBySumOfDist(listHetGenotypes)); + for (int i = 0; i < listHetGenotypes.size(); ++i) { + if (i != phaseRelativeToIndex && i != phasingSiteIndex) + scoreAllIndices.add(i); + } + + Set keepIndices = new TreeSet(); + // always keep these two indices: + keepIndices.add(phaseRelativeToIndex); + keepIndices.add(phasingSiteIndex); + for (int addInd : scoreAllIndices) { + if (keepIndices.size() >= maxPhaseSites) + break; + else // keepIndices.size() < maxPhaseSites + keepIndices.add(addInd); + } + + List newListHetGenotypes = new LinkedList(); + int newPhaseRelativeToIndex = -1; + int newPhasingSiteIndex = -1; + int oldIndex = 0; + int newIndex = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + if 
(keepIndices.contains(oldIndex)) { + newListHetGenotypes.add(grb); + + if (oldIndex == phaseRelativeToIndex) + newPhaseRelativeToIndex = newIndex; + if (oldIndex == phasingSiteIndex) + newPhasingSiteIndex = newIndex; + + ++newIndex; + } + ++oldIndex; + } + + phaseRelativeToIndex = newPhaseRelativeToIndex; + phasingSiteIndex = newPhasingSiteIndex; + listHetGenotypes = newListHetGenotypes; + if (DEBUG) + logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); + + return listHetGenotypes; + } + } + + // Phase a particular sample's het genotype using a constructed PhasingWindow: + private PhaseResult phaseSampleAtSite(PhasingWindow phaseWindow) { + /* Will map a phase and its "complement" to a single representative phase, + and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: + */ + int[] marginalizeInds = {phaseWindow.phaseRelativeToIndex, phaseWindow.phasingSiteIndex}; + HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, marginalizeInds); + PhasingTable sampleHaps = tabCreator.getNewTable(); + + if (DEBUG && logger.isDebugEnabled()) { + logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); + logger.debug("USED READS:"); + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + PhasingRead rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } + } + + // Update the phasing table based on each of the sub-reads for this sample: + MaxHaplotypeAndQuality prevMaxHapAndQual = null; + + int numHighQualityIterations = 0; + int numInconsistentIterations = 0; + + double totalAbsPQchange = 0; + int numPQchangesObserved = 0; + + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + PhasingRead rd = nameToReads.getValue(); + if 
(DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); + + for (PhasingTable.PhasingTableEntry pte : sampleHaps) { + PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); + pte.getScore().integrateReadScore(score); + if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); + } + + // Check the current best haplotype assignment and compare it to the previous one: + MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); + if (DEBUG) + logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); + if (prevMaxHapAndQual != null) { + double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; + + if (passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { + numHighQualityIterations++; + if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase + (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ + if (DEBUG) logger.debug("Inconsistent read found!"); + numInconsistentIterations++; + } + } + + totalAbsPQchange += Math.abs(changeInPQ); + numPQchangesObserved++; + } + prevMaxHapAndQual = curMaxHapAndQual; + } + + if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); + MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, DEBUG); + double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); + + if (DEBUG) + logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality); + if (DEBUG) + logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations); + + 
boolean phasingContainsInconsistencies = false; + if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS) + phasingContainsInconsistencies = true; + + return new PhaseResult(maxHapQual.getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies); + } + + // Object represents the maximum-scoring haplotype and its corresponding quality score + private static class MaxHaplotypeAndQuality { + public PhasingTable.PhasingTableEntry maxEntry; + public double phaseQuality; + + public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) { + // Marginalize each haplotype to its first 2 positions: + hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable); + if (printDebug) + logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n"); + + calculateMaxHapAndPhasingQuality(hapTable, printDebug); + } + + // Calculates maxEntry and its PQ (within table hapTable): + private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) { + hapTable.normalizeScores(); + if (printDebug) + logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n"); + + // Determine the phase at this position: + this.maxEntry = hapTable.maxEntry(); + + // convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.trueProbToQual(posteriorProb): + PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO); + for (PhasingTable.PhasingTableEntry pte : hapTable) { + if (pte != maxEntry) + sumErrorProbs.plusEqual(pte.getScore()); + } + this.phaseQuality = -10.0 * (sumErrorProbs.getLog10Value()); + } + + // Comparator that compares if 2 haplotypes map back to the same "representative" haplotype (accounts for reverse complementarity) + public boolean hasSameRepresentativeHaplotype(MaxHaplotypeAndQuality that) { + return this.getRepresentative().equals(that.getRepresentative()); + } + + private Haplotype getRepresentative() { + return 
maxEntry.getHaplotypeClass().getRepresentative(); + } + } + + /* + Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap. + */ + + public static String[] ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, String[] prevPairNames, Haplotype hap) { + if (hap.size() < 2) + throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!"); + + String[] curPairNames = prevPairNames; + + byte prevBase = hap.getBase(0); // The 1st base in the haplotype + byte curBase = hap.getBase(1); // The 2nd base in the haplotype + + boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); + boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); + if (chosePrevTopChrom != choseCurTopChrom) { + //curAllelePair.swapAlleles(); + + /* Instead of swapping the alleles (as we used to above), + we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype: + */ + curPairNames = new String[]{prevPairNames[1], prevPairNames[0]}; + } + + return curPairNames; + } + + private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { + return startDistancesAreInWindowRange(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); + } + + private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) { + return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2) + } + + private int startDistance(GenomeLoc gl1, VariantContext vc2) { + return gl1.distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); + } + + public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) { + if (statsAndList != null) { + writeVcList(statsAndList.output); + stats.addIn(statsAndList.ps); + } + return stats; + } + + /** + * Phase anything left in the cached unphasedSiteQueue, 
and report the number of reads and VariantContexts processed. + * + * @param result the number of reads and VariantContexts seen. + */ + public void onTraversalDone(PhasingStats result) { + List finalList = processQueue(result, true); // process all remaining data + writeVcList(finalList); + writer.close(); + + if (statsWriter != null) + statsWriter.close(); + + if (outputMultipleBaseCountsWriter != null) + outputMultipleBaseCountsWriter.close(); + + System.out.println("Coverage over ALL samples:"); + System.out.println("Number of reads observed: " + result.getNumReads()); + System.out.println("Number of variant sites observed: " + result.getNumVarSites()); + System.out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites())); + + System.out.println("\n--- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites: " + maxPhaseSites + ", cacheWindow: " + cacheWindow + "] ---"); + for (Map.Entry sampPhaseCountEntry : result.getPhaseCounts()) { + PhaseCounts pc = sampPhaseCountEntry.getValue(); + System.out.print("Sample: " + sampPhaseCountEntry.getKey() + "\tSites tested: " + pc.numTestedSites + "\tSites phased: " + pc.numPhased); + System.out.println("\tPhase-inconsistent sites: " + (pc.numInconsistentSitesPhased + pc.numInconsistentSitesNotPhased) + " [phased: " + pc.numInconsistentSitesPhased + ", unphased:" + pc.numInconsistentSitesNotPhased + "]"); + } + System.out.println(""); + } + + private void writeVcList(List varContList) { + for (VariantContext vc : varContList) + writeVCF(vc); + } + + private void writeVCF(VariantContext vc) { + if (samplesToPhase == null || vc.isNotFiltered()) + //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant + writer.add(vc); + } + + public static boolean processVariantInPhasing(VariantContext vc) { + return vc.isNotFiltered() && 
((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well + //return isUnfilteredBiallelicSNP(vc); + } + + + /* + Inner classes: + */ + + // A variant and the reads for each sample at that site: + private class VariantAndReads { + public VariantContext variant; + public HashMap sampleReadBases; + + public VariantAndReads(VariantContext variant, HashMap sampleReadBases) { + this.variant = variant; + this.sampleReadBases = sampleReadBases; + } + + public VariantAndReads(VariantContext variant, AlignmentContext alignment) { + this.variant = variant; + this.sampleReadBases = new HashMap(); + + if (alignment != null) { + ReadBackedPileup pileup = alignment.getBasePileup(); + if (pileup != null) { + // filter the read-base pileup based on min base and mapping qualities: + pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE); + if (pileup != null) { + for (final String sample : pileup.getSamples()) { + ReadBackedPileup samplePileup = pileup.getPileupForSample(sample); + ReadBasesAtPosition readBases = new ReadBasesAtPosition(); + for (PileupElement p : samplePileup) { + if (!p.isDeletion()) // IGNORE deletions for now + readBases.putReadBase(p); + } + sampleReadBases.put(sample, readBases); + } + } + } + } + } + } + + // Object to represent a variant that has yet to be phased, along with its underlying base pileups: + private class UnfinishedVariantAndReads { + public UnfinishedVariantContext unfinishedVariant; + public HashMap sampleReadBases; + + public UnfinishedVariantAndReads(VariantAndReads vr) { + this.unfinishedVariant = new UnfinishedVariantContext(vr.variant); + this.sampleReadBases = vr.sampleReadBases; + } + } + + // COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]... 
+ + private class UnfinishedVariantContext implements HasGenomeLocation { + private String name; + private String contig; + private int start; + private int stop; + private Collection alleles; + private Map genotypes; + private double log10PError; + private Set filters; + private Map attributes; + private String id; + + public UnfinishedVariantContext(VariantContext vc) { + this.name = vc.getSource(); + this.id = vc.getID(); + this.contig = vc.getChr(); + this.start = vc.getStart(); + this.stop = vc.getEnd(); + this.alleles = vc.getAlleles(); + + this.genotypes = new HashMap(); + for ( final Genotype g : vc.getGenotypes() ) { + this.genotypes.put(g.getSampleName(), g); + } + + this.log10PError = vc.getLog10PError(); + this.filters = vc.filtersWereApplied() ? vc.getFilters() : null; + this.attributes = new HashMap(vc.getAttributes()); + } + + public VariantContext toVariantContext() { + GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); + return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) + .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); + } + + public GenomeLoc getLocation() { + return getToolkit().getGenomeLocParser().createGenomeLoc(contig, start, stop); + } + + public Genotype getGenotype(String sample) { + return genotypes.get(sample); + } + + public void setGenotype(String sample, Genotype newGt) { + this.genotypes.put(sample, newGt); + } + + public void setPhasingInconsistent() { + attributes.put(PHASING_INCONSISTENT_KEY, true); + } + } + + private static String toStringGRL(List grbList) { + boolean first = true; + StringBuilder sb = new StringBuilder(); + for (GenotypeAndReadBases grb : grbList) { + if (first) + first = false; + else + sb.append(" -- "); + + sb.append(grb.loc); + } + return sb.toString(); + } + + private String toStringVCL(List vcList) { + boolean first = true; + StringBuilder sb = new StringBuilder(); + for (VariantContext vc : vcList) { + if (first) + first 
= false; + else + sb.append(" -- "); + + sb.append(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); + } + return sb.toString(); + } + +// +// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() +// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH +// AN ArrayList OF Allele [OR SIMILAR OBJECT], and WON'T USE: getSingleBase(alleleI) +// + + /* Creates table of all 2^n local haplotypes, + where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads + */ + private static abstract class HaplotypeTableCreator { + protected Genotype[] genotypes; + + public HaplotypeTableCreator(Genotype[] hetGenotypes) { + this.genotypes = hetGenotypes; + } + + abstract public PhasingTable getNewTable(); + + protected List getAllHaplotypes() { + int numSites = genotypes.length; + int[] genotypeCards = new int[numSites]; + for (int i = 0; i < numSites; i++) + genotypeCards[i] = genotypes[i].getPloidy(); + + LinkedList allHaps = new LinkedList(); + CardinalityCounter alleleCounter = new CardinalityCounter(genotypeCards); + for (int[] alleleInds : alleleCounter) { + byte[] hapBases = new byte[numSites]; + for (int i = 0; i < numSites; i++) { + Allele alleleI = genotypes[i].getAllele(alleleInds[i]); + hapBases[i] = SNPallelePair.getSingleBase(alleleI); + } + allHaps.add(new Haplotype(hapBases)); + } + return allHaps; + } + + /* For phasing site X relative to site X-1, we sum the probabilities over all haplotypes of the phases of [X-1, X]. + That is, we aggregate probability mass over all haplotypes consistent with a particular phase at the [X-1, X] pair. 
+ */ + public static PhasingTable marginalizeAsNewTable(PhasingTable table) { + TreeMap hapMap = new TreeMap(); + for (PhasingTable.PhasingTableEntry pte : table) { + Haplotype rep = pte.getHaplotypeClass().getRepresentative(); + PreciseNonNegativeDouble score = hapMap.get(rep); + if (score == null) { + score = new PreciseNonNegativeDouble(ZERO); + hapMap.put(rep, score); + } + score.plusEqual(pte.getScore()); + } + + PhasingTable margTable = new PhasingTable(); + for (Map.Entry hapClassAndScore : hapMap.entrySet()) { + Haplotype rep = hapClassAndScore.getKey(); + ArrayList hapList = new ArrayList(); + hapList.add(rep); + + HaplotypeClass hc = new HaplotypeClass(hapList, rep); + margTable.addEntry(hc, hapClassAndScore.getValue()); + } + return margTable; + } + } + + // Implementation for diploid alleles (thus assuming 2^n haplotypes): + private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator { + private SNPallelePair[] SNPallelePairs; + Set marginalizeInds; + + public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) { + super(hetGenotypes); + + this.SNPallelePairs = new SNPallelePair[genotypes.length]; + for (int i = 0; i < genotypes.length; i++) + SNPallelePairs[i] = new SNPallelePair(genotypes[i]); + + this.marginalizeInds = new TreeSet(); + for (int mind : marginalizeInds) + this.marginalizeInds.add(mind); + } + + public PhasingTable getNewTable() { + int startIndex = marginalizeInds.iterator().next(); + + PhasingTable table = new PhasingTable(); + for (Haplotype hap : getAllHaplotypes()) { + if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { + /* hap is the "representative" haplotype [DEFINED here to be + the one with the top base at the startIndex position. + NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!] 
+ */ + ArrayList hapList = new ArrayList(); + hapList.add(hap); + hapList.add(complement(hap)); + + Haplotype rep = hap.subHaplotype(marginalizeInds); + double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype + + HaplotypeClass hapClass = new HaplotypeClass(hapList, rep); + table.addEntry(hapClass, hapClassPrior); + } + } + return table; + } + + // Can change later to weight the representative Haplotypes differently: + + private double getHaplotypeRepresentativePrior(Haplotype rep) { + return 1.0; + } + + /* Since assuming biallelic genotypes, we use this to map a haplotype to the corresponding haplotype, + where the other allele is chosen at each site + */ + private Haplotype complement(Haplotype hap) { + int numSites = SNPallelePairs.length; + if (hap.size() != numSites) + throw new ReviewedStingException("INTERNAL ERROR: hap.size() != numSites"); + + // Take the other base at EACH position of the Haplotype: + byte[] complementBases = new byte[numSites]; + for (int i = 0; i < numSites; i++) + complementBases[i] = SNPallelePairs[i].getOtherBase(hap.getBase(i)); + + return new Haplotype(complementBases); + } + } + + // Table to represent the list of all haplotypes and their scores: + private static class PhasingTable implements Iterable { + private LinkedList table; + + public PhasingTable() { + this.table = new LinkedList(); + } + + public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, PreciseNonNegativeDouble initialScore) { + PhasingTableEntry pte = new PhasingTableEntry(haplotypeClass, new PhasingScore(initialScore)); + table.add(pte); + return pte; + } + + public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, double initialScore) { + return addEntry(haplotypeClass, new PreciseNonNegativeDouble(initialScore)); + } + + public Iterator iterator() { + return table.iterator(); + } + + public boolean isEmpty() { + return table.isEmpty(); + } + + public PhasingTableEntry 
maxEntry() { + if (table.isEmpty()) + return null; + + PhasingTableEntry maxPte = null; + for (PhasingTableEntry pte : table) { + if (maxPte == null || pte.getScore().gt(maxPte.getScore())) { + maxPte = pte; + } + } + return maxPte; + } + + // Normalize all the scores of the phasing table by their sum total: + public void normalizeScores() { + PreciseNonNegativeDouble normalizeBy = new PreciseNonNegativeDouble(ZERO); + for (PhasingTableEntry pte : table) + normalizeBy.plusEqual(pte.getScore()); + + if (!normalizeBy.equals(ZERO)) { // prevent precision problems + for (PhasingTableEntry pte : table) + pte.getScore().divEqual(normalizeBy); + } + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("-------------------\n"); + for (PhasingTableEntry pte : this) { + sb.append("Haplotypes:\t" + pte.getHaplotypeClass() + "\tScore:\t" + pte.getScore() + "\n"); + } + sb.append("-------------------\n"); + return sb.toString(); + } + + // An entry in the phasing table for a particular set of equivalent haplotypes (e.g., a haplotype and its "complement" -- see above) + public static class PhasingTableEntry implements Comparable { + private HaplotypeClass haplotypeClass; + private PhasingScore score; + + public PhasingTableEntry(HaplotypeClass haplotypeClass, PhasingScore score) { + this.haplotypeClass = haplotypeClass; + this.score = score; + } + + public HaplotypeClass getHaplotypeClass() { + return haplotypeClass; + } + + public PhasingScore getScore() { + return score; + } + + public int compareTo(PhasingTableEntry that) { + return this.getScore().compareTo(that.getScore()); + } + } + } + + private static class PhaseResult { + public Haplotype haplotype; + public double phaseQuality; + public boolean phasingContainsInconsistencies; + + public PhaseResult(Haplotype haplotype, double phaseQuality, boolean phasingContainsInconsistencies) { + this.haplotype = haplotype; + this.phaseQuality = phaseQuality; + this.phasingContainsInconsistencies = 
phasingContainsInconsistencies; + } + } + + public static boolean isUnfilteredBiallelicSNP(VariantContext vc) { + return (vc.isNotFiltered() && vc.isSNP() && vc.isBiallelic()); + } + + public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) { + return (! gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2); + } + + // Class to output verbose information on instances where a single read has multiple bases at the same position (e.g., from paired-end overlap with a base error): + private class MultipleBaseCountsWriter { + private BufferedWriter writer = null; + private TreeMap multipleBaseCounts = null; + + public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) { + FileOutputStream output; + try { + output = new FileOutputStream(outputMultipleBaseCountsFile); + } catch (FileNotFoundException e) { + throw new RuntimeException("Unable to create multiple base count file at location: " + outputMultipleBaseCountsFile); + } + this.writer = new BufferedWriter(new OutputStreamWriter(output)); + + this.multipleBaseCounts = new TreeMap(); // implemented SampleReadLocus.compareTo() + } + + public void setMultipleBases(SampleReadLocus srl, GenomeLoc phasingLoc, byte prevBase, byte newBase) { + MultipleBaseCounts mbc = multipleBaseCounts.get(srl); + if (mbc == null) { + mbc = new MultipleBaseCounts(phasingLoc); + mbc.incrementBaseCount(prevBase); // only now, do we know to note this + multipleBaseCounts.put(srl, mbc); + } + if (mbc.samePhasingLocAs(phasingLoc)) // otherwise, don't want to count these multiple base counts again + mbc.incrementBaseCount(newBase); + + } + + public void outputMultipleBaseCounts() { + GenomeLoc nextToPhaseLoc = null; + if (!unphasedSiteQueue.isEmpty()) + nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); + + outputMultipleBaseCounts(nextToPhaseLoc); + } + + private void outputMultipleBaseCounts(GenomeLoc nextToPhaseLoc) { + try { + Iterator> 
multBaseCountIt = multipleBaseCounts.entrySet().iterator(); + while (multBaseCountIt.hasNext()) { + Map.Entry sampleReadLocBaseCountsEntry = multBaseCountIt.next(); + SampleReadLocus srl = sampleReadLocBaseCountsEntry.getKey(); + if (nextToPhaseLoc == null || !startDistancesAreInWindowRange(srl.getLocus(), nextToPhaseLoc)) { + // Done with entry, so print it and remove it from map: + writer.write(srl + "\t" + sampleReadLocBaseCountsEntry.getValue() + "\n"); + multBaseCountIt.remove(); + } + } + writer.flush(); + } catch (IOException e) { + throw new RuntimeException("Unable to write to outputMultipleBaseCountsFile", e); + } + } + + public void close() { + outputMultipleBaseCounts(null); + + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close outputMultipleBaseCountsFile"); + } + } + } +} + + +class PhasingScore extends PreciseNonNegativeDouble { + public PhasingScore(double score) { + super(score); + } + + public PhasingScore(PreciseNonNegativeDouble val) { + super(val); + } + + public PhasingScore integrateReadScore(PhasingScore score) { + timesEqual(score); + return this; + } +} + +class HaplotypeClass implements Iterable { + private ArrayList haps; + private Haplotype rep; + + public HaplotypeClass(ArrayList haps, Haplotype rep) { + this.haps = haps; + this.rep = rep; + } + + public Iterator iterator() { + return haps.iterator(); + } + + public Haplotype getRepresentative() { + return rep; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + for (Haplotype h : haps) { + if (isFirst) + isFirst = false; + else + sb.append(" + "); + + sb.append(h); + } + sb.append(" [").append(rep).append("]"); + return sb.toString(); + } +} + +// Summary statistics about phasing rates, for each sample +class PhasingStats { + private int numReads; + private int numVarSites; + + // Map of: sample -> PhaseCounts: + private Map samplePhaseStats; + + public PhasingStats() 
{ + this(new TreeMap()); + } + + public PhasingStats(int numReads, int numVarSites) { + this.numReads = numReads; + this.numVarSites = numVarSites; + this.samplePhaseStats = new TreeMap(); + } + + public PhasingStats(Map samplePhaseStats) { + this.numReads = 0; + this.numVarSites = 0; + this.samplePhaseStats = samplePhaseStats; + } + + public void addIn(PhasingStats other) { + this.numReads += other.numReads; + this.numVarSites += other.numVarSites; + + for (Map.Entry sampPhaseEntry : other.samplePhaseStats.entrySet()) { + String sample = sampPhaseEntry.getKey(); + PhaseCounts otherCounts = sampPhaseEntry.getValue(); + PhaseCounts thisCounts = this.samplePhaseStats.get(sample); + if (thisCounts == null) { + thisCounts = new PhaseCounts(); + this.samplePhaseStats.put(sample, thisCounts); + } + thisCounts.addIn(otherCounts); + } + } + + public int getNumReads() { + return numReads; + } + + public int getNumVarSites() { + return numVarSites; + } + + public Collection> getPhaseCounts() { + return samplePhaseStats.entrySet(); + } +} + +class PhaseCounts { + public int numTestedSites; // number of het sites directly succeeding het sites + public int numInconsistentSitesPhased; + public int numInconsistentSitesNotPhased; + public int numPhased; + + public PhaseCounts() { + this.numTestedSites = 0; + this.numInconsistentSitesPhased = 0; + this.numInconsistentSitesNotPhased = 0; + this.numPhased = 0; + } + + public void addIn(PhaseCounts other) { + this.numTestedSites += other.numTestedSites; + this.numInconsistentSitesPhased += other.numInconsistentSitesPhased; + this.numInconsistentSitesNotPhased += other.numInconsistentSitesNotPhased; + this.numPhased += other.numPhased; + } +} + +class PhasingStatsAndOutput { + public PhasingStats ps; + public List output; + + public PhasingStatsAndOutput(PhasingStats ps, List output) { + this.ps = ps; + this.output = output; + } +} + +class PhasingQualityStatsWriter { + private String variantStatsFilePrefix; + private HashMap 
sampleToStatsWriter = new HashMap(); + + public PhasingQualityStatsWriter(String variantStatsFilePrefix) { + this.variantStatsFilePrefix = variantStatsFilePrefix; + } + + public void addStat(String sample, GenomeLoc locus, int startDistanceFromPrevious, double phasingQuality, int numReads, int windowSize) { + BufferedWriter sampWriter = sampleToStatsWriter.get(sample); + if (sampWriter == null) { + String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ_numReads_windowSize.txt"; + + FileOutputStream output; + try { + output = new FileOutputStream(fileName); + } catch (FileNotFoundException e) { + throw new RuntimeException("Unable to create phasing quality stats file at location: " + fileName); + } + sampWriter = new BufferedWriter(new OutputStreamWriter(output)); + sampleToStatsWriter.put(sample, sampWriter); + } + try { + sampWriter.write(locus + "\t" + startDistanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\t" + windowSize + "\n"); + sampWriter.flush(); + } catch (IOException e) { + throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); + } + } + + public void close() { + for (Map.Entry sampWriterEntry : sampleToStatsWriter.entrySet()) { + BufferedWriter sampWriter = sampWriterEntry.getValue(); + try { + sampWriter.flush(); + sampWriter.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close per-sample phasing quality stats file"); + } + } + } +} + +class SampleReadLocus implements Comparable { + private String sample; + private String read; + private GenomeLoc locus; + + public SampleReadLocus(String sample, String read, GenomeLoc locus) { + this.sample = sample; + this.read = read; + this.locus = locus; + } + + public GenomeLoc getLocus() { + return locus; + } + + public int compareTo(SampleReadLocus that) { + int comp = this.sample.compareTo(that.sample); + if (comp != 0) + return comp; + + comp = this.read.compareTo(that.read); + if (comp != 0) + return comp; 
+ + return this.locus.compareTo(that.locus); + } + + public String toString() { + return "Sample " + sample + ", read " + read + ", locus " + locus; + } +} + +class MultipleBaseCounts { + private Map baseCounts; + private GenomeLoc phasingLocus; + + public MultipleBaseCounts(GenomeLoc phasingLoc) { + this.baseCounts = new HashMap(); + this.phasingLocus = phasingLoc; + } + + public boolean samePhasingLocAs(GenomeLoc loc) { + return phasingLocus.equals(loc); + } + + public void incrementBaseCount(byte base) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + Integer cnt = baseCounts.get(baseIndex); + if (cnt == null) + cnt = 0; + + baseCounts.put(baseIndex, cnt + 1); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append("Base counts"); + for (Map.Entry baseCountEntry : baseCounts.entrySet()) { + byte base = BaseUtils.baseIndexToSimpleBase(baseCountEntry.getKey()); + int cnt = baseCountEntry.getValue(); + sb.append("\t" + (char) base + ": " + cnt); + } + + return sb.toString(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java new file mode 100644 index 000000000..7054d78cd --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java @@ -0,0 +1,395 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.simulatereads; + +import cern.jet.random.Poisson; +import cern.jet.random.engine.MersenneTwister; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +import java.util.*; + +/** + * Generates simulated reads for variants + * + *

Given a set of variants, this tool will generate simulated reads that support the input variants.

+ * + *

Caveats

+ *

For practical reasons, only bi-allelic variants that are not too close to the ends of contigs (< 1/2 read length) are supported; all others will simply be ignored.

+ * + *

Input

+ *

A VCF file containing variants.

+ * + *

Output

+ *

A BAM file containing simulated sequence reads that support the input variants, with the requested error rate and coverage depth.

+ * + *

Example

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -T SimulateReadsForVariants \
+ *   -R reference.fasta \
+ *   -V input_variants.vcf \
+ *   -o simulated_reads.bam \
+ *   --readDepth 50 \
+ *   --errorRate 25
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.EB) + +@Reference(window=@Window(start=-200,stop=200)) +public class SimulateReadsForVariants extends RodWalker { + + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * The simulated reads will be written to a BAM file. + */ + @Output(doc="Reads corresponding to variants", required=true) + protected StingSAMFileWriter readWriter; + /** + * Use this argument to set the desired target read depth. See the readSamplingMode argument for options that + * determine whether coverage distribution will be exactly this value or an approximation. + */ + @Argument(fullName="readDepth", shortName="DP", doc="Read depth to generate", required=false, minValue = 0, minRecommendedValue = 1, maxRecommendedValue = 1000, maxValue = Integer.MAX_VALUE) + public int readDepth = 20; + /** + * Errors will be generated at this rate in the simulated reads. Base qualities are therefore also assigned this value. + */ + @Argument(fullName="errorRate", shortName="ER", doc="Base error rate (Phred-scaled)", required=false, minValue = 0, maxValue = Integer.MAX_VALUE) + public int phredErrorRate = 20; + /** + * All simulated reads will be exactly this length. + */ + @Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE) + public int readLength = 101; + /** + * The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not + * affect the properties of the simulated reads. 
+ */ + @Advanced + @Argument(fullName="rgPlatform", shortName="RGPL", doc="Sequencing platform", required=false) + public NGSPlatform rgPlatform = NGSPlatform.ILLUMINA; + /** + * This determines how read sampling is achieved, and affects the coverage distribution of simulated reads. + * CONSTANT sampling will produce uniform depth at all positions, while POISSON sampling will produce a + * distribution of coverages around the requested value. + */ + @Advanced + @Argument(fullName="readSamplingMode", shortName="RSM", doc="Sampling mode", required=false) + public ReadSamplingMode samplingMode = ReadSamplingMode.CONSTANT; + public enum ReadSamplingMode { CONSTANT, POISSON }; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags, for integration tests", required=false) + public boolean NO_PG_TAG = false; + + @Hidden + @Argument(fullName="verbose", shortName="verbose", doc="Verbose", required=false) + public boolean verbose = false; + + public static final String PROGRAM_RECORD_NAME = "GATK SimulateReadsForVariants"; + + // variables used to store state + private long readNameCounter = 1; + private int halfReadLength; + private double errorRate; + private byte[] readQuals; + private SAMFileHeader header = null; + + // randomness related variables + private static final long RANDOM_SEED = 1252863495; + private static final Random ran = GenomeAnalysisEngine.getRandomGenerator(); + private Poisson poissonRandom = null; + + // samples and read groups + private final Map sample2RG = new HashMap(); + + private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + + private SAMReadGroupRecord createRG(String name) { + SAMReadGroupRecord rg = new SAMReadGroupRecord(name); + rg.setPlatform(rgPlatform.getDefaultPlatform()); + rg.setSample(name); + return rg; + } + + // class to store the bases, offset, and representative CIGAR of a haplotype + private static class ArtificialHaplotype { + public final byte[] bases; + 
public final int offset; + public final String cigar; + + public ArtificialHaplotype(final byte[] bases, final int offset, final String cigar) { + this.bases = bases; + this.offset = offset; + this.cigar = cigar; + } + } + + @Override + public void initialize() { + + // initialize sample -> read group map + final List sampleRGs = new ArrayList(); + for ( final String sample : SampleUtils.getUniqueSamplesFromRods(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + final SAMReadGroupRecord rg = createRG(sample); + sampleRGs.add(rg); + sample2RG.put(sample, rg); + } + + // initialize BAM headers + header = new SAMFileHeader(); + header.setSequenceDictionary(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary()); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setReadGroups(sampleRGs); + + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + if ( !NO_PG_TAG ) { + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + programRecord.setProgramVersion(headerInfo.getString("org.broadinstitute.sting.gatk.version")); + programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + } + header.setProgramRecords(Arrays.asList(programRecord)); + + readWriter.setPresorted(false); + readWriter.writeHeader(header); + + halfReadLength = readLength / 2; + errorRate = QualityUtils.qualToErrorProb((byte)phredErrorRate); + readQuals = new byte[readLength]; + Arrays.fill(readQuals, (byte)phredErrorRate); + if ( samplingMode == ReadSamplingMode.POISSON ) + poissonRandom = new Poisson(readDepth, new MersenneTwister((int)RANDOM_SEED)); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + if ( ref.getLocus().getStart() < readLength || ! 
BaseUtils.isRegularBase(ref.getBase()) ) + return 0; + + final VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + if ( vc == null || !vc.isBiallelic() ) + return 0; + + if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); + + generateReadsForVariant(vc, ref); + + return 1; + } + + /** + * Contstructs an artifical haplotype given an allele and original reference context + * + * @param allele the allele to model (can be reference) + * @param refLength the length of the reference allele + * @param ref the original reference context + * @return a non-null ArtificialHaplotype + */ + private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) { + + final byte[] haplotype = new byte[readLength]; + + final int alleleLength = allele.getBases().length; + final int halfAlleleLength = (alleleLength + 1) / 2; + + // this is how far back to move from the event to start copying bases + final int offset = halfReadLength - halfAlleleLength; + + // copy bases before the event + final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()); + int posOnRefContext = locusPosOnRefContext - offset; + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); + int copiedCount = offset; + + // copy the event bases + System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); + copiedCount += alleleLength; + + // copy bases after the event + posOnRefContext = locusPosOnRefContext + refLength; + final int remainder = readLength - copiedCount; + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + + final String cigar; + if ( refLength == alleleLength ) + cigar = readLength + "M"; + else + cigar = (offset + 1) + "M" + Math.abs(refLength - alleleLength) + (refLength > alleleLength ? 
"D" : "I") + remainder + "M"; + + return new ArtificialHaplotype(haplotype, offset, cigar); + } + + /** + * Generates the artificial reads for a given variant + * + * @param vc the (bi-allelic) variant context for which to generate artificial reads + * @param ref the original reference context + */ + private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) { + + final int refLength = vc.getReference().getBases().length; + final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); + final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + + int gi = 0; + for ( final Genotype g : vc.getGenotypes() ) { + final int myDepth = sampleDepth(); + for ( int d = 0; d < myDepth; d++ ) { + + final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap; + final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength); + + addMachineErrors(readBases, errorRate); + writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0); + } + } + } + + /** + * Decides whether or not to choose the reference haplotype, depending on the given genotype + * + * @param g the genotype of the given sample + * @return true if one should use the reference haplotype, false otherwise + */ + private boolean chooseRefHaplotype(final Genotype g) { + final double refP; + if ( g.isHomRef() ) refP = 1; + else if ( g.isHet() ) refP = 0.5; + else refP = 0.0; + + return ran.nextDouble() < refP; + } + + /** + * Generates the artificial read depth + * + * @return a non-negative int + */ + private int sampleDepth() { + switch ( samplingMode ) { + case CONSTANT: return readDepth; + case POISSON: return poissonRandom.nextInt(); + default: + throw new IllegalStateException("Unexpected DepthSamplingType " + samplingMode); + } + } + + /** + * Creates and writes an artificial read given the appropriate data + * + * @param readBases the 
bases + * @param contig the contig + * @param start the read start + * @param cigar the cigar string + * @param sample the sample name (used to get the right read group) + * @param isNegStrand should this read be on the negative strand? + */ + private void writeRead(final byte[] readBases, final String contig, final int start, + final String cigar, final String sample, final boolean isNegStrand) { + final GATKSAMRecord read = new GATKSAMRecord(header); + read.setBaseQualities(readQuals); + read.setReadBases(readBases); + read.setReadName("" + readNameCounter++); + read.setCigarString(cigar); + read.setReadPairedFlag(false); + read.setAlignmentStart(start); + read.setMappingQuality(60); + read.setReferenceName(contig); + read.setReadNegativeStrandFlag(isNegStrand); + read.setAttribute("RG", sampleRG(sample).getReadGroupId()); + + readWriter.addAlignment(read); + } + + /** + * Adds machine errors at the appropriate rate to the provided read bases + * + * @param readBases the read bases + * @param errorRate the rate at which to produce errors + */ + private void addMachineErrors(final byte[] readBases, final double errorRate) { + for ( int i = 0; i < readBases.length; i++ ) { + final double r = ran.nextDouble(); + if ( r < errorRate ) { + byte errorBase = BaseUtils.baseIndexToSimpleBase(BaseUtils.getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(readBases[i]))); + if ( errorBase == readBases[i] ) throw new IllegalStateException("Read and error bases are the same"); + readBases[i] = errorBase; + } + } + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java new file mode 100644 index 000000000..3741ce12d --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -0,0 +1,215 @@ +/* +* By downloading the PROGRAM you agree to the following terms 
of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: Mar 10, 2011 + */ + +public class Tranche { + private static final int CURRENT_VERSION = 5; + + public double ts, minVQSLod, knownTiTv, novelTiTv; + public int numKnown,numNovel; + public String name; + public VariantRecalibratorArgumentCollection.Mode model; + + int accessibleTruthSites = 0; + int callsAtTruthSites = 0; + + public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model) { + this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, model, "anonymous"); + } + + public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model, String name ) { + this.ts = ts; + this.minVQSLod = minVQSLod; + this.novelTiTv = novelTiTv; + this.numNovel = numNovel; + this.knownTiTv = knownTiTv; + this.numKnown = numKnown; + this.model = model; + this.name = name; + + this.accessibleTruthSites = accessibleTruthSites; + this.callsAtTruthSites = callsAtTruthSites; + + if ( ts < 0.0 || ts > 100.0) + throw new UserException("Target FDR is unreasonable " + ts); + + if ( numKnown < 0 || numNovel < 0) + throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel); + + if ( name == null ) + throw new ReviewedStingException("BUG -- name cannot be null"); + } + + private double getTruthSensitivity() { + return accessibleTruthSites > 0 ? 
callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; + } + + public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { + @Override + public int compare(final Tranche tranche1, final Tranche tranche2) { + return Double.compare(tranche1.ts, tranche2.ts); + } + } + + @Override + public String toString() { + return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", + ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); + } + + /** + * Returns an appropriately formatted string representing the raw tranches file on disk. + * + * @param tranches + * @return + */ + public static String tranchesString( final List tranches ) { + final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + final PrintStream stream = new PrintStream(bytes); + + if( tranches.size() > 1 ) + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); + + stream.println("# Variant quality score tranches file"); + stream.println("# Version number " + CURRENT_VERSION); + stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); + + Tranche prev = null; + for ( Tranche t : tranches ) { + stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,VQSRTranche%s%.2fto%.2f,%s,%d,%d,%.4f%n", + t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod, t.model.toString(), + (prev == null ? 0.0 : prev.ts), t.ts, t.model.toString(), t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); + prev = t; + } + + return bytes.toString(); + } + + private static double getDouble(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) { + String val = bindings.get(key); + return Double.valueOf(val); + } + else if ( required ) { + throw new UserException.MalformedFile("Malformed tranches file. 
Missing required key " + key); + } + else + return -1; + } + + private static int getInteger(Map bindings, String key, boolean required) { + if ( bindings.containsKey(key) ) + return Integer.valueOf(bindings.get(key)); + else if ( required ) { + throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); + } + else + return -1; + } + + /** + * Returns a list of tranches, sorted from most to least specific, read in from file f + * + * @param f + * @return + */ + public static List readTranches(File f) { + String[] header = null; + List tranches = new ArrayList(); + + try { + for( final String line : new XReadLines(f) ) { + if ( line.startsWith("#") ) + continue; + + final String[] vals = line.split(","); + if( header == null ) { + header = vals; + if ( header.length == 5 || header.length == 8 || header.length == 10 ) + // old style tranches file, throw an error + throw new UserException.MalformedFile(f, "Unfortunately your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator"); + if ( header.length != 11 ) + throw new UserException.MalformedFile(f, "Expected 11 elements in header line " + line); + } else { + if ( header.length != vals.length ) + throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + ". 
The line was: " + line); + + Map bindings = new HashMap(); + for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); + tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true), + getDouble(bindings,"minVQSLod", true), + getInteger(bindings,"numKnown", false), + getDouble(bindings,"knownTiTv", false), + getInteger(bindings,"numNovel", true), + getDouble(bindings,"novelTiTv", true), + getInteger(bindings,"accessibleTruthSites", false), + getInteger(bindings,"callsAtTruthSites", false), + VariantRecalibratorArgumentCollection.parseString(bindings.get("model")), + bindings.get("filterName"))); + } + } + + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); + return tranches; + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(f, e); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java new file mode 100644 index 000000000..c5e2b8183 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -0,0 +1,567 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.io.Resource; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * + *

+ * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + *

+ * + *

+ * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + *

+ * + *

Inputs

+ *

+ * The input raw variants to be recalibrated. + *

+ * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *

Output

+ *

+ * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. + *

+ * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *

Example

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T VariantRecalibrator \
+ *   -R reference/human_g1k_v37.fasta \
+ *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
+ *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+ *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+ *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
+ *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
+ *   -mode SNP \
+ *   -recalFile path/to/output.recal \
+ *   -tranchesFile path/to/output.tranches \
+ *   -rscriptFile path/to/output.plots.R
+ * 
+ * + *

Caveat

+ * + *
    + *
• The values used in the example above are only meant to show how the command lines are composed. + * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be + * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on + * how to set parameter values for your own analyses, please read the Best Practices section of the documentation.
  • + * + *
  • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R.
  • + *
+ */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.NONE) +public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { + + public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model + public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out + public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set + public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set + private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; + + @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + + ///////////////////////////// + // Inputs + ///////////////////////////// + /** + * These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. + */ + @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) + public List> inputCollections; + final private List> input = new ArrayList<>(); + + /** + * These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. + */ + @Input(fullName="aggregate", shortName = "aggregate", doc="Additional raw input variants to be used in building the model", required=false) + public List> aggregate; + + /** + * Any set of VCF files to use as lists of training, truth, or known sites. + * Training - The program builds the Gaussian mixture model using input variants that overlap with these training sites. 
+ * Truth - The program uses these truth sites to determine where to set the cutoff in VQSLOD sensitivity. + * Known - The program only uses known sites for reporting purposes (to indicate whether variants are already known or novel). They are not used in any calculations by the algorithm itself. + * Bad - A database of known bad variants can be used to supplement the set of worst ranked variants (compared to the Gaussian mixture model) that the program selects from the data to model "bad" variants. + */ + @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)", required=true) + public List> resource = Collections.emptyList(); + + ///////////////////////////// + // Outputs + ///////////////////////////// + @Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) + protected VariantContextWriter recalWriter = null; + + @Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true) + protected File TRANCHES_FILE; + + ///////////////////////////// + // Additional Command Line Arguments + ///////////////////////////// + /** + * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific + * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best + * practices documentation (http://www.broadinstitute.org/gatk/guide/best-practices) for more information. + * Normal values are 2.15 for human whole genome values and 3.2 for human whole exomes. Note + * that this parameter is used for display purposes only and isn't used anywhere in the algorithm! 
+ */ + @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false) + protected double TARGET_TITV = 2.15; + + /** + * See the input VCF file's INFO field for a list of all available annotations. + */ + @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true) + private String[] USE_ANNOTATIONS = null; + + /** + * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0 + * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible + * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%. + */ + @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) + private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; + /** + * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command. 
+ */ + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false) + private String[] IGNORE_INPUT_FILTERS = null; + @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false) + private File RSCRIPT_FILE = null; + + @Hidden + @Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false) + protected int REPLICATE = 200; + private ArrayList replicate = new ArrayList<>(); + + ///////////////////////////// + // Debug Arguments + ///////////////////////////// + @Advanced + @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false) + protected Boolean TRUST_ALL_POLYMORPHIC = false; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private VariantDataManager dataManager; + private PrintStream tranchesStream; + private final Set ignoreInputFilterSet = new TreeSet<>(); + private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public void initialize() { + dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC ); + + if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) + Utils.warnUser(logger, String.format( + "Rscript not found in 
environment path. %s will be generated but PDF plots will not.", + RSCRIPT_FILE)); + + if( IGNORE_INPUT_FILTERS != null ) { + ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); + } + + try { + tranchesStream = new PrintStream(TRANCHES_FILE); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e); + } + + for( RodBinding rod : resource ) { + dataManager.addTrainingSet( new TrainingSet( rod ) ); + } + + if( !dataManager.checkHasTrainingSet() ) { + throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); + } + if( !dataManager.checkHasTruthSet() ) { + throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); + } + + final Set hInfo = new HashSet<>(); + ApplyRecalibration.addVQSRStandardHeaderLines(hInfo); + recalWriter.writeHeader( new VCFHeader(hInfo) ); + + for( int iii = 0; iii < REPLICATE * 2; iii++ ) { + replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble()); + } + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection inputCollection : inputCollections ) + input.addAll(inputCollection.getRodBindings()); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public ExpandingArrayList map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { + final ExpandingArrayList mapList = new 
ExpandingArrayList<>(); + + if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers + return mapList; + } + + mapList.addAll( addOverlappingVariants(input, true, tracker, context) ); + if( aggregate != null ) { + mapList.addAll( addOverlappingVariants(aggregate, false, tracker, context) ); + } + + return mapList; + } + + /** + * Using the RefMetaDataTracker find overlapping variants and pull out the necessary information to create the VariantDatum + * @param rods the rods to search within + * @param isInput is this rod an -input rod? + * @param tracker the RefMetaDataTracker from the RODWalker map call + * @param context the AlignmentContext from the RODWalker map call + * @return a list of VariantDatums, can be empty + */ + private List addOverlappingVariants( final List> rods, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context ) { + if( rods == null ) { throw new IllegalArgumentException("rods cannot be null."); } + if( tracker == null ) { throw new IllegalArgumentException("tracker cannot be null."); } + if( context == null ) { throw new IllegalArgumentException("context cannot be null."); } + + final ExpandingArrayList variants = new ExpandingArrayList<>(); + + for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) { + if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) { + if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) { + final VariantDatum datum = new VariantDatum(); + + // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. + dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine + datum.loc = ( isInput ? 
getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null ); + datum.originalQual = vc.getPhredScaledQual(); + datum.isSNP = vc.isSNP() && vc.isBiallelic(); + datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); + datum.isAggregate = !isInput; + + // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately + dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC ); + final double priorFactor = QualityUtils.qualToProb( datum.prior ); + datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor ); + + variants.add( datum ); + } + } + } + + return variants; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public ExpandingArrayList reduceInit() { + return new ExpandingArrayList<>(); + } + + @Override + public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { + reduceSum.addAll( mapValue ); + return reduceSum; + } + + @Override + public ExpandingArrayList treeReduce( final ExpandingArrayList lhs, final ExpandingArrayList rhs ) { + rhs.addAll( lhs ); + return rhs; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // on traversal done + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public void onTraversalDone( final ExpandingArrayList reduceSum ) { + dataManager.setData( reduceSum ); + dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation + + // Generate the positive model using the training data and evaluate each variant + final List positiveTrainingData = 
dataManager.getTrainingData(); + final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS ); + engine.evaluateData( dataManager.getData(), goodModel, false ); + + // Generate the negative model using the worst performing data and evaluate each variant contrastively + final List negativeTrainingData = dataManager.selectWorstVariants(); + final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); + dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory + engine.evaluateData( dataManager.getData(), badModel, true ); + + if( badModel.failedToConverge || goodModel.failedToConverge ) { + throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") ); + } + + engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); + + // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user + final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY ); + final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); + final List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE ); + tranchesStream.print(Tranche.tranchesString( tranches )); + + logger.info( "Writing out recalibration table..." 
); + dataManager.writeOutRecalibrationTable( recalWriter ); + if( RSCRIPT_FILE != null ) { + logger.info( "Writing out visualization Rscript file..."); + createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) ); + } + + if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) { + // Print out an info message to make it clear why the tranches plot is not generated + logger.info("Tranches plot will not be generated since we are running in INDEL mode"); + } else { + // Execute the RScript command to plot the table of truth values + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); + executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); + // Print out the command line to make it clear to the user what is being executed and how one might modify it + logger.info("Executing: " + executor.getApproximateCommandLine()); + executor.exec(); + } + } + + private void createVisualizationScript( final List randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { + PrintStream stream; + try { + stream = new PrintStream(RSCRIPT_FILE); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, e); + } + + // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/ + stream.println("library(ggplot2)"); + // For compactPDF in R 2.13+ + stream.println("library(tools)"); + // For graphical functions R 2.14.2+ + stream.println("library(grid)"); + + createArrangeFunction( stream ); + + stream.println("outputPDF <- \"" + RSCRIPT_FILE + ".pdf\""); + stream.println("pdf(outputPDF)"); // Unfortunately this is a huge pdf file, BUGBUG: need to work on 
reducing the file size + + for(int iii = 0; iii < annotationKeys.length; iii++) { + for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) { + logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot..."); + + final List fakeData = new ExpandingArrayList<>(); + double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0; + for( final VariantDatum datum : randomData ) { + minAnn1 = Math.min(minAnn1, datum.annotations[iii]); + maxAnn1 = Math.max(maxAnn1, datum.annotations[iii]); + minAnn2 = Math.min(minAnn2, datum.annotations[jjj]); + maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]); + } + // Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D + final double NUM_STEPS = 60.0; + for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) { + for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) { + final VariantDatum datum = new VariantDatum(); + datum.prior = 0.0; + datum.annotations = new double[randomData.get(0).annotations.length]; + datum.isNull = new boolean[randomData.get(0).annotations.length]; + for(int ann=0; ann< datum.annotations.length; ann++) { + datum.annotations[ann] = 0.0; + datum.isNull[ann] = true; + } + datum.annotations[iii] = ann1; + datum.annotations[jjj] = ann2; + datum.isNull[iii] = false; + datum.isNull[jjj] = false; + fakeData.add(datum); + } + } + + engine.evaluateData( fakeData, goodModel, false ); + engine.evaluateData( fakeData, badModel, true ); + + stream.print("surface <- c("); + for( final VariantDatum datum : fakeData ) { + stream.print(String.format("%.4f, %.4f, %.4f, ", + dataManager.denormalizeDatum(datum.annotations[iii], iii), + dataManager.denormalizeDatum(datum.annotations[jjj], jjj), + Math.min(4.0, Math.max(-4.0, datum.lod)))); + } + stream.println("NA,NA,NA)"); + stream.println("s <- matrix(surface,ncol=3,byrow=T)"); + + 
stream.print("data <- c("); + for( final VariantDatum datum : randomData ) { + stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,", + dataManager.denormalizeDatum(datum.annotations[iii], iii), + dataManager.denormalizeDatum(datum.annotations[jjj], jjj), + (datum.lod < lodCutoff ? -1.0 : 1.0), + (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1))); + } + stream.println("NA,NA,NA,NA,1)"); + stream.println("d <- matrix(data,ncol=5,byrow=T)"); + + final String surfaceFrame = "sf." + annotationKeys[iii] + "." + annotationKeys[jjj]; + final String dataFrame = "df." + annotationKeys[iii] + "." + annotationKeys[jjj]; + + stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])"); + stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])"); + stream.println("dummyData <- " + dataFrame + "[1,]"); + stream.println("dummyData$x <- NaN"); + stream.println("dummyData$y <- NaN"); + stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")"); + stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))"); + 
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))"); + stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); + stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)"); + stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))"); + stream.println("arrange(p1, p2, p3, p4, ncol=2)"); + } + } + stream.println("dev.off()"); + + stream.println("if (exists(\"compactPDF\")) {"); + stream.println("compactPDF(outputPDF)"); + stream.println("}"); + + stream.close(); + + // Execute Rscript command to generate the clustering plots + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(RSCRIPT_FILE); + logger.info("Executing: " + executor.getApproximateCommandLine()); + executor.exec(); + } + + // The Arrange function is how we place the 4 model plots on one page + // from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html + private void createArrangeFunction( final PrintStream stream ) { + stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)"); + stream.println("arrange <- function(..., nrow=NULL, ncol=NULL, 
as.table=FALSE) {"); + stream.println("dots <- list(...)"); + stream.println("n <- length(dots)"); + stream.println("if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}"); + stream.println("if(is.null(nrow)) { nrow = ceiling(n/ncol)}"); + stream.println("if(is.null(ncol)) { ncol = ceiling(n/nrow)}"); + stream.println("grid.newpage()"); + stream.println("pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )"); + stream.println("ii.p <- 1"); + stream.println("for(ii.row in seq(1, nrow)){"); + stream.println("ii.table.row <- ii.row "); + stream.println("if(as.table) {ii.table.row <- nrow - ii.table.row + 1}"); + stream.println("for(ii.col in seq(1, ncol)){"); + stream.println("ii.table <- ii.p"); + stream.println("if(ii.p > n) break"); + stream.println("print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))"); + stream.println("ii.p <- ii.p + 1"); + stream.println("}"); + stream.println("}"); + stream.println("}"); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java new file mode 100644 index 000000000..81067e695 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java @@ -0,0 +1,136 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Mar 4, 2011 + */ + +public class VariantRecalibratorArgumentCollection { + + public enum Mode { + SNP, + INDEL, + BOTH + } + + static Mode parseString(final String input) { + if( input.equals("SNP") ) { return Mode.SNP; } + if( input.equals("INDEL") ) { return Mode.INDEL; } + if( input.equals("BOTH") ) { return Mode.BOTH; } + throw new ReviewedStingException("VariantRecalibrator mode string is unrecognized, input = " + input); + } + + @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) 
BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false) + public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; + + @Advanced + @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false) + public int MAX_GAUSSIANS = 8; + + @Advanced + @Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm. The actual maximum used is the min of the mG and mNG arguments. Note that this number should be small (like 4) to achieve the best results", required=false) + public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2; + + @Advanced + @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. 
Procedure will normally end when convergence is detected.", required=false) + public int MAX_ITERATIONS = 150; + + @Advanced + @Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false) + public int NUM_KMEANS_ITERATIONS = 100; + + @Advanced + @Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false) + public double STD_THRESHOLD = 10.0; + + @Advanced + @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false) + public double SHRINKAGE = 1.0; + + @Advanced + @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false) + public double DIRICHLET_PARAMETER = 0.001; + + @Advanced + @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false) + public double PRIOR_COUNTS = 20.0; + + @Advanced + @Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. 
Training sets large than this will be randomly downsampled.", required=false) + protected int MAX_NUM_TRAINING_DATA = 2500000; + + @Advanced + @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false) + public int MIN_NUM_BAD_VARIANTS = 1000; + + @Advanced + @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false) + public double BAD_LOD_CUTOFF = -5.0; + + ///////////////////////////// + // Deprecated Arguments + // Keeping them here is meant to provide users with error messages that are more informative than "arg not defined" when they use an argument that has been put out of service + ///////////////////////////// + + @Hidden + @Deprecated + @Argument(fullName="percentBadVariants", shortName="percentBad", doc="This argument is no longer used in GATK versions 2.7 and newer. Please see the online documentation for the latest usage recommendations.", required=false) + public double PERCENT_BAD_VARIANTS = 0.03; + + @Hidden + @Deprecated + @Argument(fullName="numBadVariants", shortName="numBad", doc="This argument is no longer used in GATK versions 2.8 and newer. 
Please see the online documentation for the latest usage recommendations.", required=false) + public int NUM_BAD_VARIANTS = 1000; +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java new file mode 100644 index 000000000..0f577cb23 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java @@ -0,0 +1,326 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Combines any number of gVCF files that were produced by the Haplotype Caller into a single joint gVCF file. + * + *

+ * CombineGVCFs is meant to be used for hierarchical merging of gVCFs that will eventually be input into GenotypeGVCFs. + * One would use this tool when needing to genotype too large a number of individual gVCFs; instead of passing them + * all in to GenotypeGVCFs, one would first use CombineGVCFs on smaller batches of samples and then pass these combined + * gVCFs to GenotypeGVCFs. + * + * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller + * as part of the "single sample discovery" pipeline using the '-ERC GVCF' mode, which uses a sophisticated reference + * model to produce accurate genotype likelihoods for every position in the target. + * + *

Input

+ *

+ * One or more Haplotype Caller gVCFs to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineGVCFs \
+ *   --variant gvcf1.vcf \
+ *   --variant gvcf2.vcf \
+ *   -o mergeGvcf.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=1)) +public class CombineGVCFs extends RodWalker { + + protected final class PositionalState { + final List VCs; + final byte[] refBases; + final GenomeLoc loc; + public PositionalState(final List VCs, final byte[] refBases, final GenomeLoc loc) { + this.VCs = VCs; + this.refBases = refBases; + this.loc = loc; + } + } + + protected final class OverallState { + final LinkedList VCs = new LinkedList<>(); + GenomeLoc prevPos = null; + byte refAfterPrevPos; + + public OverallState() {} + } + + /** + * The gVCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input gVCF files", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which the combined gVCF should be written") + protected VariantContextWriter vcfWriter = null; + + private GenomeLocParser genomeLocParser; + + public void initialize() { + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + + genomeLocParser = getToolkit().getGenomeLocParser(); + } + + public PositionalState map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = ref.getLocus(); + return 
new PositionalState(tracker.getValues(variants, loc), ref.getBases(), loc); + } + + public OverallState reduceInit() { + return new OverallState(); + } + + public OverallState reduce(final PositionalState startingStates, final OverallState previousState) { + if ( startingStates == null ) + return previousState; + + final int currentPos = startingStates.loc.getStart(); + + if ( !startingStates.VCs.isEmpty() ) { + if ( ! okayToSkipThisSite(currentPos, previousState.prevPos) ) + endPreviousStates(previousState, currentPos - 1, startingStates.refBases[0]); + previousState.VCs.addAll(startingStates.VCs); + } + + if ( containsEndingContext(previousState.VCs, currentPos) ) { + endPreviousStates(previousState, currentPos, startingStates.refBases.length > 1 ? startingStates.refBases[1] : (byte)'N'); + } + + return previousState; + } + + /** + * Is it okay to skip the given position? + * + * @param thisPos this position + * @param lastPosRun the last position for which we created a VariantContext + * @return true if it is okay to skip this position, false otherwise + */ + private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) { + return lastPosRun != null && thisPos == lastPosRun.getStart() + 1; + } + + /** + * Does the given list of VariantContexts contain any whose context ends at the given position? + * + * @param VCs list of VariantContexts + * @param pos the position to check against + * @return true if there are one or more VCs that end at pos, false otherwise + */ + private boolean containsEndingContext(final List VCs, final int pos) { + if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null"); + + for ( final VariantContext vc : VCs ) { + if ( isEndingContext(vc, pos) ) + return true; + } + return false; + } + + /** + * Does the given variant context end (in terms of reference blocks, not necessarily formally) at the given position. 
+ * Note that for the purposes of this method/tool, deletions are considered to be single base events (as opposed to + * reference blocks), hence the check for the number of alleles (because we know there will always be a allele). + * + * @param vc the variant context + * @param pos the position to query against + * @return true if this variant context "ends" at this position, false otherwise + */ + private boolean isEndingContext(final VariantContext vc, final int pos) { + return vc.getNAlleles() > 2 || vc.getEnd() == pos; + } + + /** + * Disrupt the VariantContexts so that they all stop at the given pos, write them out, and put the remainder back in the list. + * + * @param state the state with list of VariantContexts + * @param pos the target ending position + * @param refBase the reference base to use at the position AFTER pos + */ + private void endPreviousStates(final OverallState state, final int pos, final byte refBase) { + + final List stoppedVCs = new ArrayList<>(state.VCs.size()); + + for ( int i = state.VCs.size() - 1; i >= 0; i-- ) { + final VariantContext vc = state.VCs.get(i); + if ( vc.getStart() <= pos ) { + + stoppedVCs.add(vc); + + // if it was ending anyways, then remove it from the future state + if ( isEndingContext(vc, pos) ) + state.VCs.remove(i); + } + } + + if ( !stoppedVCs.isEmpty() ) { + final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos); + + // we need the specialized merge if the site contains anything other than ref blocks + final VariantContext mergedVC; + if ( containsTrueAltAllele(stoppedVCs) ) + mergedVC = GATKVariantContextUtils.referenceConfidenceMerge(stoppedVCs, gLoc, refBase, false); + else + mergedVC = referenceBlockMerge(stoppedVCs, state, pos); + + vcfWriter.add(mergedVC); + state.prevPos = gLoc; + state.refAfterPrevPos = refBase; + } + } + + /** + * Combine a list of reference block VariantContexts. 
+ * We can't use GATKVariantContextUtils.simpleMerge() because it is just too slow for this sort of thing. + * + * @param VCs the variant contexts to merge + * @param state the state object + * @param end the end of this block (inclusive) + * @return a new merged VariantContext + */ + private VariantContext referenceBlockMerge(final List VCs, final OverallState state, final int end) { + + final VariantContext first = VCs.get(0); + + // ref allele and start + final Allele refAllele; + final int start; + if ( state.prevPos == null || !state.prevPos.getContig().equals(first.getChr()) || first.getStart() >= state.prevPos.getStart() + 1) { + start = first.getStart(); + refAllele = first.getReference(); + } else { + start = state.prevPos.getStart() + 1; + refAllele = Allele.create(state.refAfterPrevPos, true); + } + + // attributes + final Map attrs = new HashMap<>(1); + attrs.put(VCFConstants.END_KEY, Integer.toString(end)); + + // genotypes + final GenotypesContext genotypes = GenotypesContext.create(); + for ( final VariantContext vc : VCs ) { + for ( final Genotype g : vc.getGenotypes() ) + genotypes.add(new GenotypeBuilder(g).alleles(Arrays.asList(refAllele, refAllele)).make()); + } + + return new VariantContextBuilder("", first.getChr(), start, end, Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)).attributes(attrs).genotypes(genotypes).make(); + } + + /** + * Does the given list of VariantContexts contain any with an alternate allele other than ? 
+ * + * @param VCs list of VariantContexts + * @return true if there are one or more VCs that contain a true alternate allele, false otherwise + */ + private boolean containsTrueAltAllele(final List VCs) { + if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null"); + + for ( final VariantContext vc : VCs ) { + if ( vc.getNAlleles() > 2 ) + return true; + } + return false; + } + + @Override + public void onTraversalDone(final OverallState state) { + // there shouldn't be any state left unless the user cut in the middle of a gVCF block + if ( !state.VCs.isEmpty() ) + logger.warn("You have asked for an interval that cuts in the middle of one or more gVCF blocks. Please note that this will cause you to lose records that don't end within your interval."); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java new file mode 100644 index 000000000..a6d151df8 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java @@ -0,0 +1,276 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Genotypes any number of gVCF files that were produced by the Haplotype Caller into a single joint VCF file. + * + *

+ * GenotypeGVCFs merges gVCF records that were produced as part of the "single sample discovery" pipeline using
+ * the '-ERC GVCF' mode of the Haplotype Caller. This tool performs the multi-sample joint aggregation
+ * step and merges the records together in a sophisticated manner.
+ *
+ * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods,
+ * re-genotype the newly merged record, and then re-annotate it.
+ *
+ * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller,
+ * which uses a sophisticated reference model to produce accurate genotype likelihoods for every position in the target.
+ *
+ * <h3>Input</h3>
+ * <p>
+ * One or more Haplotype Caller gVCFs to genotype.
+ * </p>
+ *
+ * <h3>Output</h3>
+ * <p>
+ * A combined, genotyped VCF.
+ * </p>
+ *
+ * <h3>Examples</h3>
+ * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T GenotypeGVCFs \
+ *   --variant gvcf1.vcf \
+ *   --variant gvcf2.vcf \
+ *   -o output.vcf
+ * </pre>
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-10,stop=10)) +public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible { + + /** + * The gVCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input gVCF files", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + // TODO -- currently this option doesn't actually work; must fix + @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false) + public boolean INCLUDE_NON_VARIANTS = false; + + /** + * Which annotations to recompute for the combined output VCF file. + */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"})); + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. 
+ */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + // the genotyping engine + private UnifiedGenotyperEngine genotypingEngine; + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + public List> getCompRodBindings() { return Collections.emptyList(); } + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + + public void initialize() { + // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + if ( dbsnp != null && dbsnp.dbsnp.isBound() ) + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); + + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // create the genotyping engine + genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), new UnifiedArgumentCollection(), logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + } + + public VariantContext map(final 
RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = ref.getLocus(); + final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null, true); + if ( combinedVC == null ) + return null; + + return regenotypeVC(tracker, ref, combinedVC); + } + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * + * @param tracker the ref tracker + * @param ref the ref context + * @param originalVC the combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext originalVC) { + if ( originalVC == null ) throw new IllegalArgumentException("originalVC cannot be null"); + + VariantContext result = originalVC; + + // only re-genotype polymorphic sites + if ( result.isVariant() ) { + VariantContext regenotypedVC = genotypingEngine.calculateGenotypes(result); + if ( regenotypedVC == null ) + return null; + + regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC); + + // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations + final Map attrs = new HashMap<>(originalVC.getAttributes()); + attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + + result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make(); + } + + // if it turned monomorphic and we don't want such sites, quit + if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() ) + return null; + + // re-annotate it + result = 
annotationEngine.annotateContext(tracker, ref, null, result); + + // fix some of the annotations + return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result.getGenotypes())).make(); + } + + /** + * Cleans up genotype-level annotations that need to be updated. + * 1. move MIN_DP to DP if present + * 2. remove SB is present + * + * @param newGs the new Genotypes to fix + * @return a new set of Genotypes + */ + private List cleanupGenotypeAnnotations(final GenotypesContext newGs) { + final List recoveredGs = new ArrayList<>(newGs.size()); + for ( final Genotype newG : newGs ) { + final Map attrs = new HashMap<>(newG.getExtendedAttributes()); + + final GenotypeBuilder builder = new GenotypeBuilder(newG); + + // move the MIN_DP to DP + if ( newG.hasExtendedAttribute("MIN_DP") ) { + builder.DP(newG.getAttributeAsInt("MIN_DP", 0)); + attrs.remove("MIN_DP"); + } + + // remove SB + attrs.remove("SB"); + + recoveredGs.add(builder.noAttributes().attributes(attrs).make()); + } + return recoveredGs; + } + + public VariantContextWriter reduceInit() { + return vcfWriter; + } + + public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) { + if ( vc != null ) + writer.add(vc); + return writer; + } + + @Override + public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) { + return lhs; + } + + @Override + public void onTraversalDone(final VariantContextWriter writer) {} +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java new file mode 100644 index 000000000..6f56415f7 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java @@ -0,0 +1,263 @@ +/* +* By downloading the PROGRAM you agree to 
the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.util.*; + +public class PosteriorLikelihoodsUtils { + + public static VariantContext calculatePosteriorGLs(final VariantContext vc1, + final Collection resources, + final int numRefSamplesFromMissingResources, + final double globalFrequencyPriorDirichlet, + final boolean useInputSamples, + final boolean useEM, + final boolean useAC) { + if ( useEM ) + throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); + + final Map totalAlleleCounts = new HashMap<>(); + for ( final VariantContext resource : resources ) { + addAlleleCounts(totalAlleleCounts,resource,useAC); + } + + if ( useInputSamples ) { + addAlleleCounts(totalAlleleCounts,vc1,useAC); + } + + totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); + + // now extract the counts of the alleles present within vc1, and in order + final double[] alleleCounts = new double[vc1.getNAlleles()]; 
+ int alleleIndex = 0; + for ( final Allele allele : vc1.getAlleles() ) { + + alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ? + totalAlleleCounts.get(allele) : 0 ); + } + + final List likelihoods = new ArrayList<>(vc1.getNSamples()); + for ( final Genotype genotype : vc1.getGenotypes() ) { + likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null ); + } + + final List posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2)); + + final GenotypesContext newContext = GenotypesContext.create(); + for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) { + final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); + if ( posteriors.get(genoIdx) != null ) { + GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, + GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); + builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, + Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); + } + newContext.add(builder.make()); + } + + final List priors = Utils.listFromPrimitives( + GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs()); + + final VariantContextBuilder builder = new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG", priors); + // add in the AC, AF, and AN attributes + VariantContextUtils.calculateChromosomeCounts(builder, true); + return builder.make(); + } + + /** + * Given genotype likelihoods and known allele counts, calculate the posterior likelihoods + * over the genotype states + * @param genotypeLikelihoods - the genotype likelihoods for the individual + * @param knownAlleleCountsByAllele - the known allele counts in the population. 
For AC=2 AN=12 site, this is {10,2} + * @param ploidy - the ploidy to assume + * @return - the posterior genotype likelihoods + */ + protected static List calculatePosteriorGLs(final List genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2"); + } + + final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy); + final List posteriors = new ArrayList<>(genotypeLikelihoods.size()); + for ( final double[] likelihoods : genotypeLikelihoods ) { + double[] posteriorProbabilities = null; + + if ( likelihoods != null ) { + if ( likelihoods.length != genotypePriorByAllele.length ) { + throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d", + knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length)); + } + + posteriorProbabilities = new double[genotypePriorByAllele.length]; + for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) { + posteriorProbabilities[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx]; + } + + posteriorProbabilities = MathUtils.normalizeFromLog10(posteriorProbabilities, true); + + } + + posteriors.add(posteriorProbabilities); + } + + return posteriors; + } + + // convenience function for a single genotypelikelihoods array. Just wraps. + protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0); + } + + + /** + * Given known allele counts (whether external, from the sample, or both), calculate the prior distribution + * over genotype states. 
This assumes + * 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet) + * 2) Genotype states are independent (Hardy-Weinberg) + * These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior + * (the "number of trials" for the multinomial is simply the ploidy) + * @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2} + * @param ploidy - the number of chromosomes in the sample. For now restricted to 2. + * @return - the Dirichlet-Multinomial distribution over genotype states + */ + protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2"); + } + + // multi-allelic format is + // AA AB BB AC BC CC AD BD CD DD ... + final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele); + final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2]; + int priorIndex = 0; + for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) { + for ( int allele1 = 0; allele1 <= allele2; allele1++) { + final int[] counts = new int[knownCountsByAllele.length]; + counts[allele1] += 1; + counts[allele2] += 1; + priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy); + } + } + + return priors; + } + + private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { + final int[] ac; + if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! 
useAC ) { + ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + } else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); + } else { + ac = new int[context.getAlternateAlleles().size()]; + int idx = 0; + for ( final Allele allele : context.getAlternateAlleles() ) { + ac[idx++] = context.getCalledChrCount(allele); + } + } + + for ( final Allele allele : context.getAlleles() ) { + final int count; + if ( allele.isReference() ) { + if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { + count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac); + } else { + count = context.getCalledChrCount() - (int) MathUtils.sum(ac); + } + } else { + count = ac[context.getAlternateAlleles().indexOf(allele)]; + } + if ( ! counts.containsKey(allele) ) { + counts.put(allele,0); + } + counts.put(allele,count + counts.get(allele)); + } + } + + public static int[] extractInts(final Object integerListContainingVCField) { + List mleList = null; + if ( integerListContainingVCField instanceof List ) { + if ( ((List) integerListContainingVCField).get(0) instanceof String ) { + mleList = new ArrayList<>(((List) integerListContainingVCField).size()); + for ( Object s : ((List)integerListContainingVCField)) { + mleList.add(Integer.parseInt((String) s)); + } + } else { + mleList = (List) integerListContainingVCField; + } + } else if ( integerListContainingVCField instanceof Integer ) { + mleList = Arrays.asList((Integer) integerListContainingVCField); + } else if ( integerListContainingVCField instanceof String ) { + mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField)); + } + if ( mleList == null ) + throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ + VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); + + final int[] mle = new int[mleList.size()]; + + if ( ! 
( mleList.get(0) instanceof Integer ) ) { + throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName()); + } + + for ( int idx = 0; idx < mle.length; idx++) { + mle[idx] = mleList.get(idx); + } + + return mle; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariants.java diff --git a/protected/java/src/org/broadinstitute/sting/package-info.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/package-info.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/package-info.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/package-info.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/SequenceComplexity.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/SequenceComplexity.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java new file mode 100644 index 000000000..5c7dbd505 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/collections/CountSet.java @@ -0,0 +1,516 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.collections; + +import com.google.java.contract.Requires; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Efficient implementation for a small set of integer primitive values. + *

+ * It includes an increment operation incAll which is convenient when analyzing the read-threading graphs. Nevertheless + * it can also be used as a general-purpose set. +

+ *

+ * It does not provide an O(1) look-up of its elements though. These are kept in a sorted array so look-up is implemented + * using a binary search, O(log n). Therefore it might not be optimal for problems that require large integer sets. +

+ *

+ * Also note that addition can be costly for large sets unless done in order: O(n). + *

+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class CountSet implements Cloneable, Set { + + /** + * The size of the set. + */ + private int size; + + /** + * Holds the element of the set within the subrange [0 .. size - 1] in ascending order. + */ + private int[] elements; + + /** + * Creates a copy of an existing int-set. + * @param template the intset to copy values from. + */ + public CountSet(final CountSet template) { + elements = template.elements.clone(); + size = template.size; + } + + /** + * Creates a new set indicating the expected maximum number of elements it will contain. + * @param initialCapacity the desired initial capacity of the set. + * @throws IllegalArgumentException if initialCapacity is negative. + */ + public CountSet(int initialCapacity) { + if (initialCapacity < 0) + throw new IllegalArgumentException(); + elements = new int[initialCapacity]; + size = 0; + } + + /** + * Set the set contents to a single integer value. + * @param value the integer value to set the set to. + */ + public void setTo(int value) { + ensureCapacity(1); + size = 1; + elements[0] = value; + } + + /** + * Set the content of this set to a collection of integers. + * @param values the new values to be included in the set. + * @throws NullPointerException if value is null. + */ + public void setTo(int ... values) { + ensureCapacity(values.length); + size = values.length; + System.arraycopy(values, 0, elements, 0, size); + Arrays.sort(elements,0,size); + } + + /** + * Increase (or decrease) all elements in the set by a number. + * @param delta the number of add (or substract if negative) to all elements. + * + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean incAll(final int delta) { + if (size == 0 || delta == 0) + return false; + for (int i = 0; i < size; i++) + elements[i] += delta; + return true; + } + + /** + * Returns the smallest integer value in the set. 
+ * + * @throws NoSuchElementException if the set is empty (thus there is no minimum). + * @return the smallest integer value in the set. + */ + public int min() { + if (size == 0) + throw new NoSuchElementException("cannot have a min from an empty set"); + return elements[0]; + } + + /** + * Returns the largest integer value in the set. + * + * @throws NoSuchElementException if the set is empty (thus there is no maximum). + * @return the largest integer value in the set. + */ + public int max() { + if (size == 0) + throw new NoSuchElementException("cannot have a max from an empty set"); + return elements[size - 1]; + } + + /** + * Adds a range of integer values to the collection. + * + * This method avoid the need to explicity indicate all values in that range. Notice that the range is fully inclusive. + * You can indicate a decrease range (fromValue > toValue). + * + * @param fromValue the first value to add in the set (inclusive). + * @param toValue the last value to add to the set (inclusive). + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addRange(final int fromValue, final int toValue) { + final int lowEnd; + final int highEnd; + + if (fromValue <= toValue) { + lowEnd = fromValue; highEnd = toValue; + } else { + highEnd = fromValue; lowEnd = toValue; + } + + //TODO to be optimized to add missing sub-ranges in one go: + boolean result = false; + for (int i = lowEnd; i <= highEnd; i++) + result = add(i) | result; + return result; + } + + /** + * Add an integer value to the set. + * @param value to add to the set. + * @return true if the set changed as a result of this invocation, false otherwise. 
+ */ + public boolean add(final int value) { + int pos = Arrays.binarySearch(elements,0,size,value); + if (pos >= 0) return false; + int insertPos = - pos - 1; + ensureCapacity(size + 1); + System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos); + elements[insertPos] = value; + size++; + return true; + } + + /** + * Add a arbitrary number of integers to the set. + * + * @param values integer to add to the set. + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addAll(final int ... values) { + ensureCapacity(size + values.length); + boolean result = false; + for (final int v : values) + result = add(v) | result; + return result; + } + + @Override + public boolean addAll(final Collection numbers) { + ensureCapacity(size + numbers.size()); + boolean result = false; + for (final Number n : numbers) + result = add(n.intValue()) | result; + return result; + } + + /** + * Add all values within a range in an integer array. + * + * @param source array where the values to add are found. + * @param fromIndex first position from source to add (inclusive). + * @param toIndex index after the last position in source to add (thus exclusive). + * @throws NullPointerException if source is null. + * @throws NegativeArraySizeException if fromIndex or toIndex are negative. + * @throws ArrayIndexOutOfBoundsException if fromIndex or toIndex are beyond bounds + * allowed [0 .. source.length]. + * @return true if the set changed as a result of this invocation, false otherwise. + */ + public boolean addAll(final int[] source, final int fromIndex, final int toIndex) { + ensureCapacity(size + source.length); + boolean result = false; + for (int i = fromIndex; i < toIndex; i++) + result = add(source[i]) | result; + return result; + } + + + /** + * Add all elements present in a int-set. + * + * @param other the other inset. + * + * @throws NullPointerException if other is null. 
+ * @return true if this set changed due to this operation, false otherwise. + */ + public boolean addAll(final CountSet other) { + return addAll(other.elements,0,other.size); + } + + /** + * Checks whether a integer value is included in the set. + * @param value the value to check. + * @return true if value is inside the set, false otherwise. + */ + public boolean contains(final int value) { + return Arrays.binarySearch(elements, 0, size, value) >= 0; + } + + /** + * Make sure that this int-set has capacity to handle a number of elements. + *

+ * If the set has already that or greater capacity nothing would be changed. + * + * @param capacity the requested capacity. + */ + private void ensureCapacity(final int capacity) { + if (elements.length >= capacity) return; + int newLength = Math.max(elements.length << 1, capacity); + elements = Arrays.copyOf(elements,newLength); + } + + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public boolean contains(final Object o) { + if (o instanceof Integer) { + final int i = (Integer)o; + return contains(i); + } else + return false; //To change body of implemented methods use File | Settings | File Templates. + } + + + @Override + public Iterator iterator() { + return new MyIterator(); + } + + @Override + public Object[] toArray() { + final Integer[] result = new Integer[size]; + for (int i = 0; i < size; i++) + result[i] = elements[i]; + return result; + } + + @Override + @SuppressWarnings("unchecked") + public T[] toArray(final T[] a) { + if (a == null) + throw new NullPointerException(); + + @SuppressWarnings("unchecked") + final Class componentClass = (Class) a.getClass().getComponentType(); + if (!componentClass.isAssignableFrom(Integer.class)) + throw new ArrayStoreException(); + + @SuppressWarnings("unchecked") + final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a; + + for (int i = 0; i < size; i++) + dest[i] = (T) (Integer) elements[i]; + return dest; + } + + /** + * Copies the content of the set into an integer array. The result can be freely modified by the invoker. + * @return never null but a zero-length array if the set is empty. + */ + public int[] toIntArray() { + return Arrays.copyOfRange(elements,0,size); + } + + /** + * Copy the content of the set into an array. + * @param dest the destination array. + * @param offset where to store the first element of the set. + * @throws NullPointerException if dest is null. 
+ * @throws ArrayIndexOutOfBoundsException if offset is out of range of there is not enough + * space after offset in the destination array to hold all values in the set. + */ + public void copyTo(final int[] dest, int offset) { + if (dest == null) + throw new NullPointerException(); + if (dest.length < (size + offset)) + throw new ArrayIndexOutOfBoundsException("destination is to short"); + System.arraycopy(elements,0,dest,offset,size); + } + + /** + * Copy the content of the set into an array. + * @param dest the destination array. + * @throws NullPointerException if dest is null. + * @throws ArrayIndexOutOfBoundsException if there is not enough + * space after offset in the destination array to hold all values in the set. + */ + public void copyTo(final int[] dest) { + copyTo(dest,0); + } + + + @Override + public boolean add(final Integer integer) { + return add((int) integer); + } + + @Override + public boolean remove(final Object o) { + return o instanceof Integer && remove((int)o); + } + + /** + * Removes a single integer value for the set. + * @param i the value to remove. + * @return true if the set has changed as a result of this invocation, false otherwise. 
+ */ + public boolean remove(final int i) { + final int pos = Arrays.binarySearch(elements,0,size,i); + if (pos < 0) + return false; + else { + removeIndex(pos); + return true; + } + } + + @Override + public boolean containsAll(final Collection c) { + for (final Object o : c) + if (!contains(o)) + return false; + return true; + } + + + @Override + public boolean retainAll(final Collection c) { + if (size == 0) + return false; + @SuppressWarnings("all") + final CountSet retainIndices = new CountSet(c.size() + 2); + retainIndices.add(-1); + retainIndices.add(size); + for (final Object o : c) { + if (!(o instanceof Integer)) + continue; + final int pos = Arrays.binarySearch(elements,0,size,(int) o); + if (pos < 0) + continue; + retainIndices.add(pos); + } + if (retainIndices.size == 2) { + size = 0; + return true; + } else if (retainIndices.size == size + 2) { + return false; + } else { + for (int idx = retainIndices.size - 1; idx > 0; idx--) { + final int toIdx = retainIndices.elements[idx]; + final int fromIdx = retainIndices.elements[idx - 1] + 1; + removeIndices(toIdx,fromIdx); + } + return true; + } + } + + /** + * Removes the values found in a range of indexes in {@link #elements}. + * @param fromIdx first index to remove (inclusive). + * @param toIdx right after last index to remove (exclusive). 
+ */ + @Requires("fromIdx >= toIdx & fromIdx >= 0 & toIdx <= size") + private void removeIndices(final int fromIdx, final int toIdx) { + System.arraycopy(elements,toIdx,elements,fromIdx,size - toIdx); + size -= toIdx - fromIdx; + } + + @Override + public boolean removeAll(final Collection c) { + boolean result = false; + for (final Object o : c) + result = remove(o) | result; + return result; + } + + @Requires("idx >= 0 && idx < size") + private void removeIndex(int idx) { + System.arraycopy(elements,idx+1,elements,idx,size - idx - 1); + } + + @Override + public void clear() { + size = 0; + } + + /** + * Returns a copy of this set which can be changed without modifying the original one. + * @return never {@code null}. + */ + @SuppressWarnings("all") + public CountSet clone() { + return new CountSet(this); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(2 + size() * 10); + sb.append('{'); + for (int i = 0; i < size; i++) + sb.append(elements[i]).append(','); + sb.replace(sb.length()-1,sb.length(),"}"); + return sb.toString(); + + } + + + /** + * Custom iterator class for {@link CountSet IntSets} + */ + private class MyIterator implements Iterator { + /** What position I am in. 
*/ + private int next = 0; + + @Override + public boolean hasNext() { + return next < size; + } + + @Override + public Integer next() { + if (next >= size) + throw new NoSuchElementException(); + return elements[next]; + } + + @Override + public void remove() { + if (next == 0) + throw new IllegalStateException(); + if (next >= size) + throw new NoSuchElementException(); + removeIndex(next - 1); + } + } + + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java new file mode 100644 index 000000000..71d61c920 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -0,0 +1,318 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. 
+* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +/** + * Genome-wide VCF writer + * + * User: depristo + * Date: 6/24/13 + * Time: 2:51 PM + */ +public class GVCFWriter implements VariantContextWriter { + // + // static VCF field names + // + protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; + protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; + protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; + + // + // Final fields initialized in constructor + // + /** Where we'll ultimately write our VCF records */ + final private VariantContextWriter underlyingWriter; + + final private List GQPartitions; + + /** fields updated on the fly during GVCFWriter operation */ + int nextAvailableStart = -1; + String contigOfNextAvailableStart = null; + private String sampleName = null; + private HomRefBlock currentBlock = null; + + /** + * Is the proposed GQ partitions well-formed? 
+ *
+ * @param GQPartitions proposed GQ partition boundaries, in strictly increasing order
+ * @return a non-null list of HomRefBlock GQ bands built from the boundaries, ending with a band reaching Integer.MAX_VALUE
+ * @throws IllegalArgumentException if GQPartitions is null, empty, contains a null, or is not strictly increasing
+ */
+ protected static List parsePartitions(final List GQPartitions) {
+     if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
+     if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
+
+     final List result = new LinkedList<>();
+     int lastThreshold = 0;
+     for ( final Integer value : GQPartitions ) {
+         if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
+         if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
+         if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
+         // each band is [lastThreshold, value) — lower bound inclusive, upper exclusive
+         result.add(new HomRefBlock(lastThreshold, value));
+         lastThreshold = value;
+     }
+     // final open-ended band catches every GQ at or above the last boundary
+     result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE));
+
+     return result;
+ }
+
+ /**
+  * Create a new GVCF writer
+  *
+  * Should be a non-empty list of boundaries. 
For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * @param underlyingWriter the ultimate destination of the GVCF records + * @param GQPartitions a well-formed list of GQ partitions + */ + public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { + if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); + this.underlyingWriter = underlyingWriter; + this.GQPartitions = parsePartitions(GQPartitions); + } + + /** + * Write the VCF header + * + * Adds standard GVCF fields to the header + * + * @param header a non-null header + */ + @Override + public void writeHeader(VCFHeader header) { + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); + + // These annotations are no longer standard + //header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); + //header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); + + for ( final HomRefBlock partition : GQPartitions ) { + header.addMetaDataLine(partition.toVCFHeaderLine()); + } + + underlyingWriter.writeHeader(header); + } + + /** + * Close this GVCF writer. 
Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well + */ + @Override + public void close() { + close(true); + } + + /** + * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK + * + * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open + * so the GATK closes it later + * + * @param closeUnderlyingWriter should we leave the underlying writer open or closed? + */ + public void close(final boolean closeUnderlyingWriter) { + emitCurrentBlock(); + if ( closeUnderlyingWriter ) underlyingWriter.close(); + } + + /** + * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate + * + * @param vc a non-null VariantContext + * @param g a non-null genotype from VariantContext + * @return a VariantContext to be emitted, or null if non is appropriate + */ + protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { + if ( nextAvailableStart != -1 ) { + // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) + if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) ) { + return null; + } + // otherwise, reset to non-relevant + nextAvailableStart = -1; + contigOfNextAvailableStart = null; + } + + if ( currentBlock == null ) { + currentBlock = createNewBlock(vc, g); + return null; + } else if ( currentBlock.withinBounds(g.getGQ()) ) { + currentBlock.add(vc.getStart(), g); + return null; + } else { + final VariantContext result = blockToVCF(currentBlock); + currentBlock = createNewBlock(vc, g); + return result; + } + } + + /** + * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null + */ + private void emitCurrentBlock() { + if ( currentBlock != null ) { + // there's actually some work to do + underlyingWriter.add(blockToVCF(currentBlock)); + currentBlock = null; + 
} + } + + /** + * Convert a HomRefBlock into a VariantContext + * + * @param block the block to convert + * @return a VariantContext representing the gVCF encoding for this block + */ + private VariantContext blockToVCF(final HomRefBlock block) { + if ( block == null ) throw new IllegalArgumentException("block cannot be null"); + + final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); + vcb.attributes(new HashMap(2)); // clear the attributes + vcb.stop(block.getStop()); + vcb.attribute(VCFConstants.END_KEY, block.getStop()); + + // This annotation is no longer standard + //vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); + + // create the single Genotype with GQ and DP annotations + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); + gb.noAD().noPL().noAttributes(); // clear all attributes + gb.GQ(block.getMedianGQ()); + gb.DP(block.getMedianDP()); + gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); + gb.PL(block.getMinPLs()); + + // This annotation is no longer standard + //gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); + + return vcb.genotypes(gb.make()).make(); + } + + /** + * Helper function to create a new HomRefBlock from a variant context and current genotype + * + * @param vc the VariantContext at the site where want to start the band + * @param g the genotype of the sample from vc that should be used to initialize the block + * @return a newly allocated and initialized block containing g already + */ + private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { + // figure out the GQ limits to use based on the GQ of g + HomRefBlock partition = null; + for ( final HomRefBlock maybePartition : GQPartitions ) { + if ( maybePartition.withinBounds(g.getGQ()) ) { + partition = maybePartition; + break; + } + } + if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); + + // 
create the block, add g to it, and return it for use + final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); + block.add(vc.getStart(), g); + return block; + } + + /** + * Add a VariantContext to this writer for emission + * + * Requires that the VC have exactly one genotype + * + * @param vc a non-null VariantContext + */ + @Override + public void add(VariantContext vc) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + + if ( sampleName == null ) + sampleName = vc.getGenotype(0).getSampleName(); + + if ( ! vc.hasGenotypes() ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes"); + } else if ( vc.getGenotypes().size() != 1 ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size()); + } else { + if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) { + // we've made a non-contiguous step (across interval, onto another chr), so finalize + emitCurrentBlock(); + } + + final Genotype g = vc.getGenotype(0); + if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) && vc.isBiallelic() ) { + // create bands + final VariantContext maybeCompletedBand = addHomRefSite(vc, g); + if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); + } else { + // g is variant, so flush the bands and emit vc + emitCurrentBlock(); + nextAvailableStart = vc.getEnd(); + contigOfNextAvailableStart = vc.getChr(); + underlyingWriter.add(vc); + } + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java new file mode 100644 index 000000000..9d14fca26 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -0,0 +1,185 @@ +/* +* 
By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFHeaderLine; + +import java.util.ArrayList; +import java.util.List; + +/** + * Helper class for calculating a GQ band in the GVCF writer + * + * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes, + * and provides summary information about the entire block of genotypes. 
+ * + * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores + * + * User: depristo + * Date: 6/25/13 + * Time: 9:41 AM + */ +final class HomRefBlock { + private final VariantContext startingVC; + private int stop; + private final int minGQ, maxGQ; + private int[] minPLs = null; + final private List GQs = new ArrayList<>(100); + final private List DPs = new ArrayList<>(100); + private final Allele ref; + + /** + * Create a new HomRefBlock + * + * @param startingVC the VariantContext that starts this band (for starting position information) + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { + if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = startingVC; + this.stop = getStart() - 1; + this.ref = startingVC.getReference(); + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Create a new HomRefBlock only for doing bounds checking + * + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(int minGQ, int maxGQ) { + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = null; + this.stop = -1; + this.ref = null; + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Add information from this Genotype to this band + * @param g a non-null Genotype with GQ and DP attributes + */ + public void add(final int pos, final Genotype g) { + if ( g == null ) throw new IllegalArgumentException("g cannot be null"); + if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! 
g.hasPL() ) throw new IllegalArgumentException("g must have PL field");
+        if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);
+
+        // Track the element-wise minimum PL vector across the band, but only for
+        // 3-element PL vectors; genotypes with any other PL length are silently skipped.
+        if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs
+            final int[] PL = g.getPL();
+            if( PL.length == 3 ) {
+                minPLs = PL.clone();
+            }
+        } else { // otherwise take the min with the provided genotype's PLs
+            final int[] PL = g.getPL();
+            if( PL.length == 3 ) {
+                minPLs[0] = Math.min(minPLs[0], PL[0]);
+                minPLs[1] = Math.min(minPLs[1], PL[1]);
+                minPLs[2] = Math.min(minPLs[2], PL[2]);
+            }
+        }
+        stop = pos;
+        GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission
+        DPs.add(Math.max(g.getDP(),0)); // negative/missing DP is clamped to 0
+    }
+
+    /**
+     * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ)
+     * @param GQ the GQ value to test
+     * @return true if within bounds, false otherwise
+     */
+    public boolean withinBounds(final int GQ) {
+        return GQ >= minGQ && GQ < maxGQ;
+    }
+
+    /** Get the min GQ observed within this band */
+    public int getMinGQ() { return MathUtils.arrayMin(GQs); }
+    /** Get the median GQ observed within this band */
+    public int getMedianGQ() { return MathUtils.median(GQs); }
+    /** Get the min DP observed within this band */
+    public int getMinDP() { return MathUtils.arrayMin(DPs); }
+    /** Get the median DP observed within this band */
+    public int getMedianDP() { return MathUtils.median(DPs); }
+    /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */
+    public int[] getMinPLs() { return minPLs; }
+
+    protected int getGQUpperBound() { return maxGQ; }
+    protected int getGQLowerBound() { return minGQ; }
+
+    // NOTE(review): this compares vc.getEnd() — not vc.getStart() — against stop+1,
+    // which is only equivalent for single-base records; confirm multi-base records
+    // (e.g. deletions) are intended to be judged by their end coordinate.
+    public boolean isContiguous(final VariantContext vc) {
+        return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr());
+    }
+
+    public VariantContext getStartingVC() { return startingVC; }
+    public int 
getStart() { return startingVC.getStart(); } + public int getStop() { return stop; } + public Allele getRef() { return ref; } + public int getSize() { return getStop() - getStart() + 1; } + + @Override + public String toString() { + return "HomRefBlock{" + + "minGQ=" + minGQ + + ", maxGQ=" + maxGQ + + '}'; + } + + public VCFHeaderLine toVCFHeaderLine() { + return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/LDMerger.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/LDMerger.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java new file mode 100644 index 000000000..e818c9899 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -0,0 +1,436 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Created with IntelliJ IDEA. 
+ * User: bradt + * Date: 6/11/13 + */ +public class ArrayLoglessPairHMM extends PairHMM { + private static final double INITIAL_CONDITION = Math.pow(2, 1020); + private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + + // Array declarations for arrays implementation + private double[] currentMatchArray = null; + private double[] currentDeleteArray = null; + private double[] currentInsertArray = null; + private double[] parentMatchArray = null; + private double[] parentDeleteArray = null; + private double[] parentInsertArray = null; + private double[] grandparentMatchArray = null; + private double[] grandparentDeleteArray = null; + private double[] grandparentInsertArray = null; + + // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading + private double[] matchCacheArray = null; + private double[] deleteCacheArray = null; + private double[] insertCacheArray = null; + + // These arrays store cache info for use with the next haplotype; for writing + private double[] nextMatchCacheArray = null; + private double[] nextDeleteCacheArray = null; + private double[] nextInsertCacheArray = null; + + // Used when caching to store our intermediate sum at point of first difference bw successive haplotypes + private double partialSum; + + + /** + * {@inheritDoc} + */ + @Override + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + transition = PairHMMModel.createTransitionMatrix(maxReadLength); + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + // Initialize all 
arrays + // Final Cell of array is a padding cell, initialized to zero. + currentMatchArray = new double[paddedMaxReadLength]; + currentDeleteArray = new double[paddedMaxReadLength]; + currentInsertArray = new double[paddedMaxReadLength]; + + parentMatchArray = new double[paddedMaxReadLength]; + parentDeleteArray = new double[paddedMaxReadLength]; + parentInsertArray = new double[paddedMaxReadLength]; + + grandparentMatchArray = new double[paddedMaxReadLength]; + grandparentDeleteArray = new double[paddedMaxReadLength]; + grandparentInsertArray = new double[paddedMaxReadLength]; + + // Initialize the special arrays used for caching when successive haplotypes have a common prefix + matchCacheArray = new double[paddedMaxReadLength]; + deleteCacheArray = new double[paddedMaxReadLength]; + insertCacheArray = new double[paddedMaxReadLength]; + + nextMatchCacheArray = new double[paddedMaxReadLength]; + nextDeleteCacheArray = new double[paddedMaxReadLength]; + nextInsertCacheArray = new double [paddedMaxReadLength]; + } + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if ( ! 
constantsAreInitialized) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + // Some housekeeping to be done if we are starting a new read + if (recacheReadValues) { + hapStartIndex = 0; + + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + + // Read length may have changed, so we need to set zero-value padding at the appropriate position. + padMatchAndInsertArrays(readBases.length); + } + + // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration + if (hapStartIndex == 0) { + clearPreviouslyCachedInfo(readBases.length); + + // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. + padDeleteArrays(haplotypeBases.length, readBases.length); + } + + // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. 
+ clearArraySolutionPosition(); + + // Some parameters to control behavior during the dynamic programming loop + final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; + int startFill; // The lower bound of the array indices we want to over-write + int endFill; // The upper bound of the array indices we want to over-write + final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype + double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction + + // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix + for (int i = 1; i <= maxDiagonals; i++) { + // set the bounds for cells we wish to fill in the arrays + startFill = Math.max(readBases.length - i, 0); + endFill = Math.min(maxDiagonals - i + 1, readBases.length); + + // apply any previously cached array information + if (i <= readBases.length) + applyPreviouslyCachedInfo(startFill); + + // fill in the cells for our current arrays + updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. + // Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above. + finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; + + // Partial sum for caching the next haplotype: + // At the position of the last similar base between this haplotype and the next one... + // ...remember the partial sum, so that we can start here on the next hap. 
+ if (i == cacheSumIndex) + partialSum = finalArraySumProbabilities; + + rotateArrayReferences(); + } + // The cache arrays we wrote for this haplotype will be read for the next haplotype. + rotateCacheArrays(); + + //return result + return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbs(transition,insertionGOP,deletionGOP,overallGCP); + } + + /** + * Pad the ends of the Match and Insert arrays with 0. + * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. + * + * @param padPosition Which index in the arrays we wish to pad + */ + private void padMatchAndInsertArrays(final int padPosition) { + grandparentMatchArray[padPosition] = 0; + grandparentInsertArray[padPosition] = 0; + parentMatchArray[padPosition] = 0; + parentInsertArray[padPosition] = 0; + currentMatchArray[padPosition] = 0; + currentInsertArray[padPosition] = 0; + matchCacheArray[padPosition] = 0; + insertCacheArray[padPosition] = 0; + nextMatchCacheArray[padPosition] = 0; + nextInsertCacheArray[padPosition] = 0; + } + + /** + * Pad the Delete arrays with an intial value. Let's us have free deletions at the beginning of the alignment. + * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. + * + * @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value + * @param padPosition Which index in the arrays we wish to pad + */ + private void padDeleteArrays(final int haplotypeLength, final int padPosition) { + final double initialValue = INITIAL_CONDITION / haplotypeLength; + + // Pad the deletion arrays. 
Akin to padding the first row in the deletion matrix + parentDeleteArray[padPosition] = initialValue; + grandparentDeleteArray[padPosition] = initialValue; + currentDeleteArray[padPosition] = initialValue; + deleteCacheArray[padPosition] = initialValue; + nextDeleteCacheArray[padPosition] = initialValue; + } + + /** + * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. + * + */ + private void clearArraySolutionPosition() { + grandparentMatchArray[0] = 0; + grandparentInsertArray[0] = 0; + parentMatchArray[0] = 0; + parentInsertArray[0] = 0; + currentMatchArray[0] = 0; + currentInsertArray[0] = 0; + } + + /** + * Clears cached information saved from the last haplotype, + * allowing us to start at the beginning of the present haplotype with intitial values of 0. + * + * @param fillLength How much of the cache arrays do we need to zero + */ + private void clearPreviouslyCachedInfo(final int fillLength) { + Arrays.fill(matchCacheArray, 0, fillLength, 0); + Arrays.fill(deleteCacheArray, 0, fillLength, 0); + Arrays.fill(insertCacheArray, 0, fillLength, 0); + + partialSum = 0; + } + + /** + * Applies cached information saved from the last haplotype, + * allowing us to start in the middle of the present haplotype. + * + * @param indK the index in the arrays we wish to update with cached info + */ + private void applyPreviouslyCachedInfo(int indK) { + // apply caching info necessary for calculating current DELETE array values + parentMatchArray[indK] = matchCacheArray[indK]; + parentDeleteArray[indK] = deleteCacheArray[indK]; + + // apply caching info necessary for calculating current MATCH array values + grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; + grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; + grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; + } + + /** + * Records the mid-process state of one location in the read/haplotype alignment. 
+ * Writes new cache information for use with the next haplotype we see. + * + * @param indK the index in the cache arrays we wish to store information in + */ + private void recordNewCacheInfo(int indK) { + nextMatchCacheArray[indK] = currentMatchArray[indK]; + nextDeleteCacheArray[indK] = currentDeleteArray[indK]; + nextInsertCacheArray[indK] = currentInsertArray[indK]; + } + + /** + * Update the HMM arrays for the current diagonal. + * + * @param readLength The length of the read + * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype + * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info + * @param startFill The lower bound of the array indices we want to over-write + * @param endFill The upper bound of the array indices we want to over-write + * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on + */ + private void updateArrays(final int readLength, + final int hapStartIndex, + final int nextHapStartIndex, + final int startFill, + final int endFill, + final int iii) { + + // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment + int matrixRow; + int matrixCol; + + int arrayIndex; + for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { + // translate the array position into a row, column in the priors and transition matrices + matrixRow = readLength - arrayIndex - 1; + matrixCol = iii - matrixRow - 1 + hapStartIndex; + + // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col + updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); + + // Set up caching for the next haplotype + // At the position of the final similar base between this haplotype and the next one, remember the mid-array values + if (matrixCol == nextHapStartIndex - 1) + recordNewCacheInfo(arrayIndex); + } + } + + /** + * Updates a cell in the HMM arrays + * + * @param indK index in the arrays to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + private void updateArrayCell( final int indK, final double prior, final double[] transition) { + currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + + grandparentInsertArray[indK + 1] * transition[indelToMatch] + + grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); + currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; + currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; + } + + /** + * To prepare for the next diagonal in our loop, each array must be bumped to an older generation + * + */ + private void rotateArrayReferences() { + double[] tempMatchArray = grandparentMatchArray; + double[] tempDeleteArray = grandparentDeleteArray; + double[] tempInsertArray = grandparentInsertArray; + + grandparentMatchArray = parentMatchArray; + grandparentDeleteArray = parentDeleteArray; + grandparentInsertArray = parentInsertArray; + + parentMatchArray = currentMatchArray; + parentDeleteArray = currentDeleteArray; + parentInsertArray = currentInsertArray; + + currentMatchArray = tempMatchArray; + currentDeleteArray = tempDeleteArray; + currentInsertArray = tempInsertArray; + } + + /** + * To prepare for 
the next haplotype, the caching info we wrote is copied into the cach-read arrays + * + */ + private void rotateCacheArrays() { + matchCacheArray = nextMatchCacheArray.clone(); + deleteCacheArray = nextDeleteCacheArray.clone(); + insertCacheArray = nextInsertCacheArray.clone(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java new file mode 100644 index 000000000..72d5c9472 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java @@ -0,0 +1,822 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Fast partial PairHMM backed on the standard Logless PairHMM + * + */ +public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { + + + /** + * Initial read length capacity. + */ + private static final int INITIAL_READ_LENGTH_CAPACITY = 200; + + /** + * Initial haplotype length capacity. + */ + private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400; + + + /** + * Holds the current read capacity. + *

It can only go up over time.

+ */ + private int readCapacity = INITIAL_READ_LENGTH_CAPACITY; + + /** + * Holds the current haplotype length capacity. + *

It can only go up over time.

+ */ + private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY; + + private int maxToCol; + private int haplotypeLength; + + /** + * Returns the currently loaded read base qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + public byte[] getReadQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readQuals; + } + + /** + * Returns the currently loaded read insertion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadInsQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readInsQuals; + } + + /** + * Returns the currently loaded read deletion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadDelQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readDelQuals; + } + + /** + * Returns the currently loaded read gap extension penalty.. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadGepQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readGepQuals; + } + + + /** + * Creates a new pair-hmm calculator instance give the gap continuation penalty. + * + * @param gcp the gap-continuation penalty. 
+ */ + public FastLoglessPairHMM(final byte gcp) { + constantGCP = gcp; + initialize(readCapacity,haplotypeCapacity); + } + + @Override + public byte getGapExtensionPenalty() { + return constantGCP; + } + + + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, final int nextHapStartIndex) { + this.readBases = readBases; + this.haplotypeBases = haplotypeBases; + this.haplotypeLength = haplotypeBases.length; + return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals, + insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex); + } + + /** + * Implement the last step summation to calculate the total likelihood. + * + * @param row number of the last row of the pair-hmm where the likelihood values are present. + * @param fromCol inclusive first column to include in the summation. + * @param toCol exclusive last column to include in the summation. + * @return 0 or less. + */ + protected double finalLikelihoodCalculation(final int row, + final int fromCol, final int toCol) { + + final double divider = Math.max(1,2 *(toCol - fromCol)); + final double dividerInverse = 1.0 / divider; + double finalLikelihood = 0; + + for (int j = fromCol; j < toCol; j++) { + finalLikelihood += matchMatrix[row][j] * dividerInverse; + finalLikelihood += insertionMatrix[row][j] * dividerInverse; + } + return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + + /** + * Initialize the matrix values for a problem including the trailing end of the read. + * + *

+ * Notice that you can improve performance by omitting filling reusable values from + * previous haplotype calculations. You can set {@code haplotypeStartOffset} to skip + * those columns. + *

+ * + * @param readStart inclusive first position of the read used in the calculations. + * @param readEnd exclusive last position of the read considered in the calculations. + * @param haplotypeStartOffset offset of the haplotype right after the reusable prefix + * from previous calls. + * + * + */ + protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd, + final int haplotypeStartOffset) { + + @SuppressWarnings("all") + final int zeroRow = readStart; + final int toRow = readEnd + 1; + final int toCol = haplotypeLength + 1; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (readStart == 0) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION); + + if (haplotypeStartOffset == 0) + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0; + + } else { + Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0); + Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0); + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow][0] = INITIAL_CONDITION; + deletionMatrix[zeroRow][0] = 0; + } + if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion]; + for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0; + insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion]; + + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][0] = deletionMatrix[i][0] = 0; + insertionMatrix[i][0] = insertionMatrix[i - 
1][0] + * transition[i][insertionToInsertion]; + } + } + } + } + + /** + * Initializes calculation matrices give the characteristics of the next and previous problems. + * @param currentProblem reference to the Lk calculation problem we are dealing currently. + * @param previousProblem reference to the Lk calculation problem that has been solved just before. + * + */ + protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) { + if (previousProblem != null && + previousProblem.readStart == currentProblem.readStart && + previousProblem.hapStart == currentProblem.hapStart && + maxToCol >= currentProblem.hapEnd + 1) + return; + + final int zeroRow = currentProblem.readStart; + final int zeroCol = currentProblem.hapStart; + final int toRow = currentProblem.readEnd + 1; + final int toCol = currentProblem.hapEnd + 1; + maxToCol = toCol; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (currentProblem.leading) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION); + + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + + } else { // If not leading set the appropriate matching 1.0 prob and + // deletion + extension. 
+ + Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0); + Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0); + matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION; + deletionMatrix[zeroRow][zeroCol] = 0; + deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion]; + for (int i = zeroCol + 2; i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0; + insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion]; + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol] + * transition[i][insertionToInsertion]; + } + } + } + + /** + * Constant gap-continuation-penalty. + */ + private final byte constantGCP; + + /** + * Currently loaded haplotype base sequence. + */ + private byte[] haplotypeBases; + + /** + * Currently loaded read base sequence. + */ + private byte[] readBases; + + /** + * Read qualities. + */ + private byte[] readQuals; + + /** + * Read insertion qualities. + */ + private byte[] readInsQuals; + + /** + * Read deletion qualities. + */ + private byte[] readDelQuals; + + /** + * Read gap-extension-penalties. + */ + private byte[] readGepQuals; + + /** + * Cached results. + */ + private Map cachedResults = new HashMap<>(); + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param read the target read. + * @throws NullPointerException if {@code read} is null. 
+ */ + @Override + public void loadRead(final GATKSAMRecord read) { + loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality()); + } + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param readBases the read bases. + * @param readQuals the read base call quality scores. + * @param readInsQuals the read insertion quality scores. + * @param readDelQuals the read deletion quality scores. + * @param mq the read mapping quality score. + * @throws NullPointerException if any of the arrays passed is {@code null}. + * @throws IllegalArgumentException if the arrays passed have incompatible sizes. + */ + public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { + // TODO This is a copy&paste from PairHMM*Engine read data preparation code. + // TODO It is simply to difficult to share the code without changing that class and I don't want + // TODO to do so for now. 
+ if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); + if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); + if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); + maxToCol = 0; + + if (readBases.length > readCapacity) { + readCapacity = readBases.length; + initialize(readCapacity,haplotypeCapacity); + } + paddedReadLength = readBases.length + 1; + final byte[] overallGCP = new byte[readBases.length]; + Arrays.fill(overallGCP, constantGCP); // Is there a way to derive + + for (int kkk = 0; kkk < readQuals.length; kkk++) { + readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], + mq); // cap base quality by mapping + readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? 
QualityUtils.MIN_USABLE_Q_SCORE + : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); + readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); + readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); + } + this.readBases = readBases; + this.readQuals = readQuals; + this.readInsQuals = readInsQuals; + this.readDelQuals = readDelQuals; + this.readGepQuals = overallGCP; + initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); + if (haplotypeBases != null) + fillPriorsTable(0); + cachedResults.clear(); + } + + @Override + public void loadHaplotypeBases(final byte[] haplotypeBases) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + this.haplotypeBases = haplotypeBases.clone(); + haplotypeLength = haplotypeBases.length; + paddedHaplotypeLength = haplotypeLength; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + } + initializePriors(this.haplotypeBases, readBases, readQuals, 0); + } + + + /** + * Changes only the suffix of the currently loaded haplotype. + * + *

+ * If from is 0, this is equivalent to calling {@link #loadHaplotypeBases(byte[])} directly. + *

+ * @param from first position on the current haplotype to substitute with the new suffix. + * It can be up to the length of the haplotype in such case this operation is in + * effect just extending that haplotype. + * @param suffix the new bases for the end part of the current haplotype. + * @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array. + * @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array. + * + * @throws IllegalStateException if no read was loaded with {@link #loadRead}. + * @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent. + * @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges. + */ + public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + if (haplotypeBases == null && from > 0) + throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded"); + if (suffixFrom < 0) + throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative"); + if (suffixTo > suffix.length) + throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length"); + if (suffixFrom > suffixTo) + throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index"); + if (from > haplotypeLength) + throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length"); + if (from < 0) + throw new IllegalArgumentException("the from index cannot be negative"); + + int startIndex = from; + if (haplotypeBases == null) { + haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo); + haplotypeLength = suffixTo - suffixFrom; + } else { + final int 
newLength = from + suffixTo - suffixFrom; + if (haplotypeBases.length < newLength) + haplotypeBases = Arrays.copyOf(haplotypeBases,newLength); + System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from); + haplotypeLength = newLength; + } + paddedHaplotypeLength = haplotypeLength + 1; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + startIndex = 0; + } + //startIndex = 0; + fillPriorsTable(startIndex); + } + + /** + * Returns the bases of the current haplotype. + * + * @throws IllegalStateException if no haplotype was loaded previously + * @return never {@code null} + */ + public byte[] getHaplotypeBases() { + if (haplotypeBases == null) + throw new IllegalStateException(); + return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength); + } + + /** + * Returns a debug representation of the pair-hmm. + * @return never {@code null}. + */ + public String toString() { + return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength)); + } + + @Override + protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) { + haplotypeBases = hapBases; + haplotypeLength = haplotypeBases.length; + this.readBases = readBases; + this.readQuals = baseQuals; + fillPriorsTable(idx); + } + + /** + * Fills the prior table up. + * + *

+ * It accepts an argument so that an already-filled prefix of the table need not be recomputed. + *

+ * + * @param idx first position in the haplotype to start filling from. + */ + protected void fillPriorsTable(final int idx) { + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = idx; j < haplotypeLength; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + + /** + * Decorates haplotype set with their likelihoods as compared with the currently loaded read. + * + * + * @param readStart inclusive start position of the targeted section of the read. + * @param readEnd exclusive end position just beyond the targeted section of the read. + * @param haplotypes in/out set of haplotypes. + */ + public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) { + final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator(); + boolean isFirst = true; + while (entryIterator.hasNext()) { + entryIterator.next(); + final int startIndex = entryIterator.startIndex(); + final byte[] bases = entryIterator.bases(); + changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length); + final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst); + isFirst = false; + entryIterator.setLikelihood(likelihood); + } + } + + + + @Override + public double calculateLocalLikelihood(final int readStart, final int readEnd, + final int hapStart, final int hapEnd, final boolean kmerMatch) { + if (readBases == null || haplotypeBases == null) + throw new IllegalStateException("read or haplotype was not loaded"); + final int hapSegmentLength = hapEnd - hapStart; + final int readSegmentLength = readEnd - readStart; + // trivial case when there is a single base match. 
+ if (kmerMatch) { + return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength); + } else if (hapSegmentLength == readSegmentLength) { + if (hapSegmentLength == 0) { + return calculateLocalLikelihoodEmptySquare(readStart, readEnd); + } else if (hapSegmentLength == 1) { + return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart); + } else { // general (slower) solution. + return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); + } + } else if (hapSegmentLength == 0) { // must be full insertion we + return calculateLocalLikelihoodInsertion(readStart, readEnd); + } else if (readSegmentLength == 0) { // full deletion. + return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd); + } else { // general (slower) solution. + return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); + } + } + + /** + * Fast likelihood when the pair-hmm represents a deletion in the read. + */ + private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) { + double result = INITIAL_CONDITION; + if (readStart > 0) { // no penalty if at the beginning. + result *= transition[readStart][matchToDeletion]; + result *= + StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1); + result *= transition[readStart][indelToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + + /** + * Fast likelihood when the pair-hmm represents a insertion in the read. 
+ */ + private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) { + double result = INITIAL_CONDITION; + result *= transition[readStart + 1][matchToInsertion]; + for (int i = readStart + 1; i < readEnd; i++) { + result *= transition[i + 1][insertionToInsertion]; + } + if (readEnd < readBases.length) { + result *= transition[readEnd + 1][indelToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Single base mismatch fast likelihood calculation. + */ + private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) { + double result = INITIAL_CONDITION; + result *= prior[readStart + 1][hapStart + 1]; + if (readStart > 0) { + result *= transition[readStart + 1][matchToMatch]; + } + if (readEnd < readBases.length) { + result *= transition[readEnd + 1][matchToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Empty square Pair-hmm. + */ + private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) { + double result = INITIAL_CONDITION; + if (readStart > 0 && readEnd < readBases.length) { + result *= transition[readStart + 1][matchToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Likelihood assuming that there is a exact match between both sequences: read and haplotype + */ + private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) { + double result = INITIAL_CONDITION; + if (hapSegmentLength == 1) { + result *= prior[readStart + 1][hapStart + 1]; + } else { + for (int i = 0; i < readSegmentLength; i++) { + result *= prior[readStart + i + 1][hapStart + i + 1]; + if (i > 0) { + result *= transition[readStart + i + 1][matchToMatch]; + } + } + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Revert to a general pair-hmm solution. 
+ */ + private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) { + final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd); + final Double cachedCost = cachedResults.get(p); + if (cachedCost != null) { + return cachedCost; + } + double cost = calculateLocalLikelihoodGeneral(p); + cachedResults.put(p, cost); + return cost; + } + + /** + * Resolve the regular full pair-hmm. + * + *

+ * With the possibility of reuse the previous haplotype common prefix by using + * a startIndex which is greater than 0. + */ + private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) { + final int edgeStart = initializeEdges ? 0 : startIndex + 1; + initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart); + updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1); + if (readEnd == readBases.length) + return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0); + else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[readEnd][haplotypeLength] + * transition[readEnd][matchToMatch] * dividerInverted + + insertionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted + + deletionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + + private double calculateLocalLikelihoodGeneral(final Problem p) { + + initializeMatrixValues(p,null); + // int fromCol = p.hapStart + 1; + // if (previousProblem == null) { + // fromCol = p.hapStart + 1; + // } else { + // final int sharedPrefix = previousProblem.followerStartIndex(p); + // if (sharedPrefix >= 0) + // fromCol = sharedPrefix + 1; + // else + // fromCol = p.hapStart + 1; + // } + // previousProblem = p; + + updateTable(p.readStart + 1, p.readEnd + 1, + p.hapStart + 1, p.hapEnd + 1); + + if (p.trailing) { + return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) + - (p.leading ? 
StrictMath.log10(p.hapEnd - p.hapStart) : 0); + } else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][matchToMatch] * dividerInverted + + insertionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted + + deletionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + private void updateTable(final int rowFrom, final int rowTo, + final int colFrom, final int colTo) { + + for (int i = rowFrom; i < rowTo; i++) { + for (int j = colFrom; j < colTo; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + } + + /** + * Holds the properties of a pair-hmm computational problem. + */ + public class Problem { + private final byte[] haplotypeSegment; + private final int readStart; + private final int readEnd; + private final int hapStart; + private final int hapEnd; + private final int hashCode; + private final boolean trailing; + private final boolean leading; + + /** + * Construct a new project object. + * @param start inclusive start position on the read to consider. + * @param end exclusive after last position on the read to consider. + * @param hapStart inclusive start position on the haplotype to consider. + * @param hapEnd exclusive after last position on the haplotype to consider. 
+ */ + public Problem(final int start, final int end, final int hapStart, + final int hapEnd) { + if (start < 0 || start > readBases.length) + throw new IllegalArgumentException("bad start index " + start); + if (end < start || end > readBases.length) + throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length); + if (hapStart < 0 || hapStart > haplotypeLength) + throw new IllegalArgumentException("bad hap start index " + + hapStart + " is larger than the haplotypeLength " + haplotypeLength); + if (hapEnd < hapStart || hapEnd > haplotypeLength) + throw new IllegalArgumentException("bad hap end index " + + hapEnd + " outside [" + hapStart + "," + + haplotypeLength + "]"); + + haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd); + readStart = start; + readEnd = end; + this.hapStart = hapStart; + this.hapEnd = hapEnd; + trailing = readEnd == readBases.length; + leading = readStart == 0; + + hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31); + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object o) { + if (o == this) + return true; + else if (o == null) + return false; + else if (o.getClass() != this.getClass()) + return false; + else { + final Problem p = (Problem) o; + return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment); + } + } + + + } + + /** + * Returns the currently loaded read base calls. + * @return {@code never null}. 
+ */ + public byte[] getReadBases() { + if (readBases == null) + throw new IllegalStateException("no read was previously loaded."); + return readBases; + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java new file mode 100644 index 000000000..31a0d1363 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -0,0 +1,180 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ +public class LoglessPairHMM extends N2MemoryPairHMM { + protected static final double INITIAL_CONDITION = Math.pow(2, 1020); + protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + final double initialValue = INITIAL_CONDITION / haplotypeBases.length; + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + + if ( ! 
constantsAreInitialized || recacheReadValues ) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. + final int endI = paddedReadLength - 1; + double finalSumProbabilities = 0.0; + for (int j = 1; j < paddedHaplotypeLength; j++) { + finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; + } + return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. 
+ + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbs(transition,insertionGOP,deletionGOP,overallGCP); + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + + insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + + deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] ); + insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 
1][indJ] * transition[insertionToInsertion]; + deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion]; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java new file mode 100644 index 000000000..6efed2689 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -0,0 +1,170 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.LRUCache; + +/** + * The object temporarily held by a read that describes all of it's covariates. 
+ * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. + * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis. + * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. 
+ */ + public static void clearKeysCache() { + keysCache.remove(); + } + + /** + * Our keys, indexed by event type x read length x covariate + */ + private final int[][][] keys; + + /** + * The index of the current covariate, used by addCovariate + */ + private int currentCovariateIndex = 0; + + public ReadCovariates(final int readLength, final int numberOfCovariates) { + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } + } + + public void setCovariateIndex(final int index) { + currentCovariateIndex = index; + } + + /** + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases + * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently + * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. 
+ * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates + */ + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; + } + + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ + public int[] getKeySet(final int readPosition, final EventType errorModel) { + return keys[errorModel.ordinal()][readPosition]; + } + + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.ordinal()]; + } + + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- + + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatum.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatum.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatLengthCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RepeatUnitCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java similarity index 100% rename from 
protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java similarity index 100% rename from protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java new file mode 100644 index 000000000..1e4d6fbf2 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/WalkerTestIntegrationTest.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class WalkerTestIntegrationTest extends WalkerTest { + + public void testBadMD5(String md5) { + WalkerTestSpec spec = new WalkerTestSpec("FAIL", Arrays.asList(md5)); + executeTest("", spec); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testNullMD5() { + testBadMD5(null); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testBadLengthMD5() { + testBadMD5("asdfasdfa"); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testSpacesMD5() { + testBadMD5("1de8e943fbf55246ebd19efa32f22a58 "); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testBadCharMD5() { + testBadMD5("1de8e943fbf55246ebd19efa32f22a5_"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java new file mode 100644 index 000000000..a118a462d --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java @@ -0,0 +1,97 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class QualByDepthUnitTest extends WalkerTest { + + @DataProvider(name = "UsingAD") + public Object[][] makeUsingADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final Genotype gAC = new GenotypeBuilder("1", AC).DP(10).AD(new int[]{5,5}).make(); + final Genotype gAA = new GenotypeBuilder("2", AA).DP(10).AD(new int[]{10,0}).make(); + final Genotype gACerror = new GenotypeBuilder("3", AC).DP(10).AD(new int[]{9,1}).make(); + final Genotype gGG = new GenotypeBuilder("4", GG).DP(10).AD(new int[]{1,9}).make(); + + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, 
AC).log10PError(-5).genotypes(Arrays.asList(gAC)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gACerror)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAA, gAC)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC, gACerror)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, ACG).log10PError(-5).genotypes(Arrays.asList(gAA, gAC, gACerror, gGG)).make(), 2.5}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "UsingAD") + public void testUsingAD(final VariantContext vc, final double expectedQD) { + final Map annotatedMap = new QualByDepth().annotate(null, null, null, null, vc, null); + Assert.assertNotNull(annotatedMap, vc.toString()); + final String QD = (String)annotatedMap.get("QD"); + Assert.assertEquals(Double.valueOf(QD), expectedQD, 0.0001); + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java new file mode 100644 index 000000000..287cd45d0 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -0,0 +1,394 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Arrays; + +public class VariantAnnotatorIntegrationTest extends WalkerTest { + + final static String REF = b37KGReference; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + + public static String baseTestString() { + return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; + } + + @Test + public void testHasAnnotsNotAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); + executeTest("test file has annotations, not asking for annotations, #1", spec); + } + + @Test + public void testHasAnnotsNotAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); + executeTest("test file has annotations, not asking for annotations, #2", spec); + } + + @Test + public void testHasAnnotsAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 
1:10,020,000-10,021,000", 1, + Arrays.asList("ff21ad7bb0d6bcabcee6b95d975570fc")); + executeTest("test file has annotations, asking for annotations, #1", spec); + } + + @Test + public void testHasAnnotsAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("cb463a56d0b5bc66940f844e56265c14")); + executeTest("test file has annotations, asking for annotations, #2", spec); + } + + @Test + public void testNoAnnotsNotAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); + executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); + } + + @Test + public void testNoAnnotsNotAsking2() { + // the genotype annotations in this file are actually out of order. If you don't parse the genotypes + // they don't get reordered. It's a good test of the genotype ordering system. 
+ WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); + executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); + } + + @Test + public void testNoAnnotsAsking1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("d57ca04b4ceb2f25b31bc0cbd88bca6b")); + executeTest("test file doesn't have annotations, asking for annotations, #1", spec); + } + + @Test + public void testNoAnnotsAsking2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("9cc0cf19070d951b1979e069552810f1")); + executeTest("test file doesn't have annotations, asking for annotations, #2", spec); + } + + @Test + public void testExcludeAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("552c2ad9dbfaa85d51d2def93c8229c6")); + executeTest("test exclude annotations", spec); + } + + @Test + public void testOverwritingHeader() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, + Arrays.asList("0ed4c7760f6e7a158b6d743d257300f3")); + executeTest("test 
overwriting header", spec); + } + + @Test + public void testNoReads() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("1c423b7730b9805e7b885ece924286e0")); + executeTest("not passing it any reads", spec); + } + + @Test + public void testDBTagWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); + executeTest("getting DB tag with dbSNP", spec); + } + + @Test + public void testMultipleIdsWithDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, + Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); + executeTest("adding multiple IDs with dbSNP", spec); + } + + @Test + public void testDBTagWithHapMap() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("cc7184263975595a6e2473d153227146")); + executeTest("getting DB tag with HM3", spec); + } + + @Test + public void testDBTagWithTwoComps() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4")); + executeTest("getting DB tag with 2 comps", spec); + } + + @Test + public void testNoQuals() { + 
WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, + Arrays.asList("aea983adc01cd059193538cc30adc17d")); + executeTest("test file doesn't have QUALs", spec); + } + + @Test + public void testUsingExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); + executeTest("using expression", spec); + } + + @Test + public void testUsingExpressionWithID() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, + Arrays.asList("3de1d1998203518098ffae233f3e2352")); + executeTest("using expression with ID", spec); + } + + @Test + public void testTabixAnnotationsAndParallelism() { + final String MD5 = "99938d1e197b8f10c408cac490a00a62"; + for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + executeTest("Testing lookup vcf tabix vs. 
vcf tribble", spec); + } + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + + executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); + } + + @Test + public void testSnpEffAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + 1, + Arrays.asList("d9291845ce5a8576898d293a829a05b7") + ); + executeTest("Testing SnpEff annotations", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersionGATKMode() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " + + "-L 1:10001292-10012424", + 1, + Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae") + ); + executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + + "-L 1:10001292-10012424", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); + } + + @Test(enabled = true) + public void testTDTAnnotation() { + final String 
MD5 = "427dfdc665359b67eff210f909ebf8a2"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing TDT annotation ", spec); + } + + + @Test(enabled = true) + public void testChromosomeCountsPed() { + final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing ChromosomeCounts annotation with PED file", spec); + } + + @Test(enabled = true) + public void testInbreedingCoeffPed() { + final String MD5 = "159a771c1deaeffb786097e106943893"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing InbreedingCoeff annotation with PED file", spec); + } + + @Test + public void testStrandBiasBySample() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = 
executeTest("testStrandBiasBySample", spec).getFirst().get(0); + + final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; + final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); + specNoFS.disableShadowBCF(); + final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); + + // confirm that the FisherStrand values are identical for the two pipelines + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("FS")); + Assert.assertTrue(vcAnn.hasAttribute("FS")); + 
Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0)); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } + + @Test + public void testQualByDepth() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); + + final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth"; + final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); + specNoQD.disableShadowBCF(); + final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("4ccdbebcfd02be87ae5b4ad94666f011")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); + + // confirm that the QD values are present in the new file for all biallelic variants + // QD values won't be identical because some filtered reads are missing during re-annotation + + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator 
lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("QD")); + Assert.assertTrue(vcAnn.hasAttribute("QD")); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariatesIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java new file mode 100644 index 000000000..39cf719dd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -0,0 +1,136 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.List; + +public final class ReadRecalibrationInfoUnitTest extends BaseTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @DataProvider(name = "InfoProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + for ( final int readLength: Arrays.asList(10, 100, 1000) ) { + for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { + tests.add(new Object[]{readLength, includeIndelErrors}); + } + } + + return tests.toArray(new Object[][]{}); + } + @Test(dataProvider = "InfoProvider") + public void testReadInfo(final int readLength, final boolean includeIndelErrors) 
{ + final ReadCovariates covariates = new ReadCovariates(readLength, 2); + + final byte[] bases = new byte[readLength]; + final byte[] baseQuals = new byte[readLength]; + final byte[] insertionQuals = new byte[readLength]; + final byte[] deletionQuals = new byte[readLength]; + final boolean[] skips = new boolean[readLength]; + final double[] snpErrors = new double[readLength]; + final double[] insertionErrors = new double[readLength]; + final double[] deletionsErrors = new double[readLength]; + for ( int i = 0; i < readLength; i++ ) { + bases[i] = 'A'; + baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); + insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); + deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); + skips[i] = i % 2 == 0; + snpErrors[i] = 1.0 / (i+1); + insertionErrors[i] = 0.5 / (i+1); + deletionsErrors[i] = 0.3 / (i+1); + } + + final EnumMap errors = new EnumMap(EventType.class); + errors.put(EventType.BASE_SUBSTITUTION, snpErrors); + errors.put(EventType.BASE_INSERTION, insertionErrors); + errors.put(EventType.BASE_DELETION, deletionsErrors); + + final EnumMap quals = new EnumMap(EventType.class); + quals.put(EventType.BASE_SUBSTITUTION, baseQuals); + quals.put(EventType.BASE_INSERTION, insertionQuals); + quals.put(EventType.BASE_DELETION, deletionQuals); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); + if ( includeIndelErrors ) { + read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); + read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); + } + + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); + + Assert.assertEquals(info.getCovariatesValues(), covariates); + Assert.assertEquals(info.getRead(), read); + + for ( int i = 0; i < readLength; i++ ) { + Assert.assertEquals(info.skip(i), skips[i]); + for ( final EventType et : EventType.values() ) { + 
Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); + final byte expectedQual = et == EventType.BASE_SUBSTITUTION || includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; + Assert.assertEquals(info.getQual(et, i), expectedQual); + } + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java 
rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngineUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNodeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/diffengine/DifferenceUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java new file mode 100644 index 000000000..8f71c35be --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 4/5/12 + * Time: 11:28 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testSNP_ACS_Pools() { + executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705"); + } + + @Test(enabled = true) + public void testBOTH_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "c2932cc77611f13cc8a14e87d055a8f8"); + } + + @Test(enabled = true) + public void testINDEL_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "a0648992f049ed59fab0ef753d2d0c03"); + } + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "fcfe18bd4c6087b21959d3c31ec177da"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java new file mode 100644 index 000000000..e16ca154f --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -0,0 +1,73 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM; +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS; + +public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","ef7a6ee4ec7e20e5ce28fc50d3362d3d"); + } + + @Test(enabled = true) + public void testMT_SNP_DISCOVERY_sp4() { + executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fc75733fcdd8079e7f7743961a1f36be"); + } + + @Test(enabled = true) + public void testMT_SNP_GGA_sp10() { + executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "86cdfc291f995036658bfc10773db107"); 
+ } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java new file mode 100644 index 000000000..8b8c82ea6 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -0,0 +1,207 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { + + private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing indel caller + // + // -------------------------------------------------------------------------------------------------------------- + // Basic indel testing with SLX data + @Test + public void testSimpleIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("bb8c1b2e9343c79133d8efb51ec2192e")); + executeTest(String.format("test indel caller in SLX"), spec); + } + + // Basic indel testing with SLX data + @Test + public void testIndelsWithLowMinAlleleCnt() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -minIndelCnt 1" + + " -L 1:10,000,000-10,100,000", + 1, + Arrays.asList("9b4ead3da021763704fcb9d80a5ee6ff")); + + executeTest(String.format("test indel caller in SLX with low min allele count"), spec); + } + + @Test + 
public void testMultiTechnologyIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("f5e5148cac1526136f9f2559fe3b49fa")); + + executeTest(String.format("test indel calling, multiple technologies"), spec); + } + + @Test + public void testWithIndelAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("209db887bfe1aac8bd62544aa8afa2b5")); + executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); + } + + @Test + public void testWithIndelAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("83b32ea956809654590abd5e0c029d4d")); + executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); + } + + @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes + public void testMultiSampleIndels1() { + // since we're going to test the MD5s with GGA only do one here + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, + Arrays.asList("")); + List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); + + WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, + Arrays.asList("25815c1968450ddd009b983d65809c50")); + executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); + } + + @Test + public void testGGAwithNoEvidenceInReads() { + final String vcf = "small.indel.test.vcf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); + executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); + } + + @Test + public void testBaseIndelQualityScores() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + + " -o %s" + + " -L 20:10,000,000-10,100,000", + 1, + Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); + + executeTest(String.format("test UG with base indel quality scores"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing MinIndelFraction + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("af0b881d0a931f0789706f0289b72a64")); + executeTest("test minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("aa97a7941a861d57a3b746b3f6301eb6")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); + executeTest("test minIndelFraction 1.0", spec); + } + + // No testing of MD5 here, we previously blew up due to a 0 length haplotypes, so we just need to pass + @Test + public void testHaplotype0Length() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", + 0, + Collections.emptyList()); + executeTest("testHaplotype0Length", spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java new file mode 100644 index 000000000..ecfda9d8a --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -0,0 +1,385 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.util.BlockCompressedInputStream; +import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class UnifiedGenotyperIntegrationTest extends WalkerTest { + + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing parameters + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testMinBaseQualityScore() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, + Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7")); + executeTest("test min_base_quality_score 26", spec); + } + + @Test + public void testSLOD() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("bc8a4e4ceb46776169b47146805c882a")); + executeTest("test SLOD", spec); + } + + @Test + public void testNDA() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + 
Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3")); + executeTest("test NDA", spec); + } + + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("21185d9a7519356ba672757f5a522971")); + executeTest("test using comp track", spec); + } + + @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + + @Test + public void testOutputParameterSitesOnly() { + testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d"); + } + + @Test + public void testOutputParameterAllConfident() { + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b"); + } + + @Test + public void testOutputParameterAllSites() { + testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8"); + } + + private void testOutputParameters(final String args, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1, + Arrays.asList(md5)); + executeTest(String.format("testParameter[%s]", args), spec); + } + + @Test + public void testConfidence() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 
", 1, + Arrays.asList("918109938ef355d759dafc3ebb47d8a5")); + executeTest("test confidence 1", spec1); + } + + @Test + public void testNoPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, + Arrays.asList("9ee4f1ee1827a6726bfac1220a6a7c40")); + executeTest("test no prior 1", spec1); + + } + @Test + public void testUserPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, + Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); + executeTest("test user prior 1", spec1); + + } + + @Test + public void emitPLsAtAllSites() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, + Arrays.asList("85dee5da72c4154e130527c4e6329c07")); + // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. 
IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail + spec1.disableShadowBCF(); + + executeTest("test all site PLs 1", spec1); + + } + // -------------------------------------------------------------------------------------------------------------- + // + // testing heterozygosity + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testHeterozyosity1() { + testHeterozosity( 0.01, "6053106407e09a6aefb78395a0e22ec4" ); + } + + @Test + public void testHeterozyosity2() { + testHeterozosity( 1.0 / 1850, "37666375278259c4d7dc800a0f73c1ca" ); + } + + private void testHeterozosity(final double arg, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1, + Arrays.asList(md5)); + executeTest(String.format("test heterozyosity[%s]", arg), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing compressed output + // + // -------------------------------------------------------------------------------------------------------------- + + private final static String COMPRESSED_OUTPUT_MD5 = "c5c6af421cffa12fe6bdaced6cd41dd2"; + + @Test + public void testCompressedOutput() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); + executeTest("test compressed output", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing parallelization + // + // 
-------------------------------------------------------------------------------------------------------------- + + @Test + public void testParallelization() { + + // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations + + String md5 = "1f3fad09a63269c36e871e7ee04ebfaa"; + final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, + Arrays.asList(md5)); + executeTest("test parallelization (single thread)", spec1); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, + Arrays.asList(md5)); + executeTest("test parallelization (2 threads)", spec2); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, + Arrays.asList(md5)); + executeTest("test parallelization (4 threads)", spec3); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing calls with SLX, 454, and SOLID data + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testMultiTechnologies() { + WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + baseCommand + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000", + 1, + Arrays.asList("630d1dcfb7650a9287d6723c38b0746a")); + + executeTest(String.format("test multiple technologies"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing calls with BAQ + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testCallingWithBAQ() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000" + + " -baq CALCULATE_AS_NECESSARY", + 1, + Arrays.asList("976e88e4accb4436ad9ac97df9477648")); + + executeTest(String.format("test calling with BAQ"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testSnpEffAnnotationRequestedWithoutRodBinding() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + + "-A SnpEff", + 1, + UserException.class); + executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing Ns in CIGAR + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testNsInCigar() { + final WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, + UserException.UnsupportedCigarOperatorException.class); + + executeTest("test calling on reads with Ns in CIGAR", spec); + } + + @Test(enabled = true) + public void testCompressedVCFOutputWithNT() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam" + + " -o %s -L 20:10,000,000-10,100,000 -nt 4", + 1, Arrays.asList("vcf.gz"), Arrays.asList("")); + final File vcf = executeTest("testCompressedVCFOutputWithNT", spec).first.get(0); + final AsciiLineReader reader = new AsciiLineReader(new BlockCompressedInputStream(vcf)); + int nLines = 0; + while ( reader.readLine() != null ) + nLines++; + Assert.assertTrue(nLines > 0); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing only emit samples + // + // -------------------------------------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testOnlyEmitSample() throws Exception { + final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "AFR.complex.variants.bam --disableDithering" + + " -o %s -L 20:10,000,000-10,100,000"; + final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); + specAllSamples.disableShadowBCF(); + final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); + final List allSampleVCs = GATKVCFUtils.readVCF(allSamplesVCF).getSecond(); + + final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); + onlyHG01879.disableShadowBCF(); + final File onlyHG01879VCF = 
executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); + final List onlyHG01879VCs = GATKVCFUtils.readVCF(onlyHG01879VCF).getSecond(); + + Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); + for ( int i = 0; i < allSampleVCs.size(); i++ ) { + final VariantContext allSampleVC = allSampleVCs.get(i); + final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); + + if ( allSampleVC == null ) { + Assert.assertNull(onlyHG01879VC); + } else { + Assert.assertNotNull(onlyHG01879VC); + + Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); + + Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); + Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); + Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); + Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); + Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); + Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); + Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); + + final Genotype allG = allSampleVC.getGenotype("HG01879"); + final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); + Assert.assertEquals(allG.getAD(), onlyG.getAD()); + Assert.assertEquals(allG.getDP(), onlyG.getDP()); + Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); + Assert.assertEquals(allG.getPL(), onlyG.getPL()); + Assert.assertEquals(allG.toString(), onlyG.toString()); + } + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java new file mode 100644 index 000000000..29b93e427 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -0,0 +1,126 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ + + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing normal calling + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testMultiSamplePilot1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, + Arrays.asList("03ff28802a2e06e0a623d9a5df66d237")); + executeTest("test MultiSample Pilot1", spec); + } + + @Test + public void testWithAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("85d0e5c086dc642d55124f0e88e7326b")); + executeTest("test MultiSample Pilot2 with alleles passed in", spec1); + } + + @Test + public void testWithAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("11783a280df9bf621840c300edd0401a")); + 
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); + } + + @Test + public void testSingleSamplePilot2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("75503fce7521378f8c2170094aff29df")); + executeTest("test SingleSample Pilot2", spec); + } + + @Test + public void testMultipleSNPAlleles() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("eac8b071bd2fa89889d51de8be84624a")); + executeTest("test Multiple SNP alleles", spec); + } + + @Test + public void testBadRead() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + Arrays.asList("d915535c1458733f09f82670092fcab6")); + executeTest("test bad read", spec); + } + + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("7f912aa5166f6ed16166daac1e5c0935")); + executeTest("test reverse trim", spec); + } + + @Test + public void testMismatchedPLs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper 
--contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("ab22f70f5c65d45f9754e7064e5a152c")); + executeTest("test mismatched PLs", spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java new file mode 100644 index 000000000..df749231e --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -0,0 +1,87 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f")); + executeTest("test calling on a ReducedRead BAM", spec); + } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", "cc0508b18028f2e84e6a42c1ff23721c"); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", "6fc00d5299b1bf334d39634c3409a69d"); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java similarity index 100% rename 
from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java new file mode 100644 index 000000000..c9476f7eb --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -0,0 +1,230 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +// SEE private/R/pls.R if you want the truth output for these tests +public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { + @DataProvider(name = "TestCombineGLs") + public Object[][] makeTestCombineGLs() { + List tests = new ArrayList(); + + tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)}); + tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)}); + tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)}); + + // AA AB BB AC BC CC => AA AB+BC CC + tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)}); + + tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + + tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)}); + tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)}); + + tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, 
makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)}); + + return tests.toArray(new Object[][]{}); + } + + private Genotype makePL(final int ... PLs) { + return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); + } + + @Test(enabled = true, dataProvider = "TestCombineGLs") + public void testCombineGLsPrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); + } + + @Test(enabled = true, dataProvider = "TestCombineGLs") + public void testCombinePrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + 
Utils.join(",", expected.getPL())); + } + + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + @DataProvider(name = "TestMakeAlleleConditionalContexts") + public Object[][] makeTestMakeAlleleConditionalContexts() { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A)); + final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C)); + final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G)); + final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G)); + final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C)); + + final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); + final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); + final Genotype gACcombined = makePL(0, 2, 5); + final Genotype gACcombined2 = makePL(0, 1, 4); + final Genotype gAGcombined = makePL(0, 4, 9); + + // biallelic + tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); + + // tri-allelic + tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())}); + tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); + + return tests.toArray(new Object[][]{}); + } + + + @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") + private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); + + 
Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size()); + + for ( int i = 0; i < biAllelicVCs.size(); i++ ) { + final VariantContext actual = biAllelicVCs.get(i); + final VariantContext expected = expectedVCs.get(i); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); + + for ( int j = 0; j < actual.getNSamples(); j++ ) + Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), + "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); + } + } + + + @DataProvider(name = "ThetaNTests") + public Object[][] makeThetaNTests() { + List tests = new ArrayList(); + + final List log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0); + + for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) { + for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) { + for ( List permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) { + tests.add(new Object[]{permutations, Math.pow(10, log10pRef)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ThetaNTests") + public void testThetaNTests(final List log10LAlleles, final double pRef) { + // biallelic + final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef}); + + final double log10pNonRef = Math.log10(1-pRef); + + final List originalPriors = new LinkedList(); + final List pNonRefN = new LinkedList(); + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final double log10LAllele1 = log10LAlleles.get(i); + final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); + originalPriors.add(result1); + pNonRefN.add(log10pNonRef*(i+1)); + } + + final IndependentAllelesDiploidExactAFCalc calc = 
(IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2); + final List thetaNPriors = calc.applyMultiAllelicPriors(originalPriors); + + double prevPosterior = 0.0; + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final AFCalcResult thetaN = thetaNPriors.get(i); + AFCalcResult orig = null; + for ( final AFCalcResult x : originalPriors ) + if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping())) + orig = x; + + Assert.assertNotNull(orig, "couldn't find original AFCalc"); + + Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6); + Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6); + + Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0()); + prevPosterior = orig.getLog10PosteriorOfAFGT0(); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java new file mode 100644 index 000000000..af66d7f88 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -0,0 +1,249 @@ +/* +* By 
downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.RandomDNA; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Tests for {@link AssemblyResultSet}. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class AssemblyResultSetUnitTest extends BaseTest +{ + private GenomeLocParser genomeLocParser; + private SAMFileHeader header; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + + @Test + public void testEmptyResultSet() { + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertEquals(subject.getHaplotypeList().size(), 0); + Assert.assertEquals(subject.getHaplotypeCount(),0); + Assert.assertEquals(subject.getReferenceHaplotype(),null); + Assert.assertEquals(subject.getFullReferenceWithPadding(),null); + Assert.assertEquals(subject.getPaddedReferenceLoc(),null); + Assert.assertEquals(subject.getRegionForGenotyping(),null); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null); + Assert.assertFalse(subject.hasMultipleKmerSizes()); + } + + @Test + public void testAddReferenceHaplotype() { + + final Haplotype ref = new Haplotype("ACGT".getBytes(),true); + ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 )); + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertTrue(subject.add(ref)); + Assert.assertFalse(subject.add(ref)); + + Assert.assertEquals(subject.getReferenceHaplotype(),ref); + Assert.assertEquals(subject.getHaplotypeCount(),1); + Assert.assertEquals(subject.getHaplotypeList().size(),1); + } + + @Test(dataProvider="assemblyResults") + public void testAddManyHaplotypes(final java.util.List assemblyResults, + final java.util.List> haplotypes) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (int i = 0; i < haplotypes.size(); i++) { + final int haplotypeCountBefore = subject.getHaplotypeCount(); + final java.util.List haplos = haplotypes.get(i); + final AssemblyResult ar = assemblyResults.get(i); + for (final Haplotype h : haplos) { + 
Assert.assertTrue(subject.add(h, ar)); + Assert.assertFalse(subject.add(h,ar)); + if (h.isReference()) + Assert.assertEquals(subject.getReferenceHaplotype(),h); + } + final int haplotypeCountAfter = subject.getHaplotypeCount(); + Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore); + Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize()); + Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize()); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph()); + } + } + + @Test(dataProvider="trimmingData") + public void testTrimTo(final Map haplotypesAndResultSets, final ActiveRegion original) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (final Map.Entry entry : haplotypesAndResultSets.entrySet()) + subject.add(entry.getKey(),entry.getValue()); + subject.setRegionForGenotyping(original); + final GenomeLoc originalLocation = original.getExtendedLoc(); + final int length = originalLocation.size(); + final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2); + final ActiveRegion newRegion = original.trim(newLocation); + + final Map originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size()); + for (final Haplotype h : haplotypesAndResultSets.keySet()) + originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); + + final AssemblyResultSet trimmed = subject.trimTo(newRegion); + + Assert.assertFalse(subject.wasTrimmed()); + Assert.assertTrue(trimmed.wasTrimmed()); + + for (final Haplotype h : trimmed.getHaplotypeList()) { + Assert.assertEquals(h.getGenomeLocation(),newLocation); + Assert.assertEquals(h.getBases().length,newLocation.size()); + } + } + + @DataProvider(name="trimmingData") + public Iterator trimmingData() { + final ActiveRegion activeRegion = new 
ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25); + final int length = activeRegion.getExtendedLoc().size(); + final RandomDNA rnd = new RandomDNA(13); // keep it reproducible by fixing the seed to lucky 13. + final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] { + "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]); + + final List haplotypes = actd.haplotypeList(); + for (final Haplotype h : haplotypes) + h.setGenomeLocation(activeRegion.getExtendedLoc()); + + final ReadThreadingGraph rtg = new ReadThreadingGraph(10); + for (final Haplotype h : haplotypes) + rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference()); + final SeqGraph seqGraph = rtg.convertToSequenceGraph(); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); + ar.setThreadingGraph(rtg); + final Map result = + new HashMap<>(); + for (final Haplotype h : haplotypes) + result.put(h,ar); + return Collections.singleton(new Object[] {result,activeRegion}).iterator(); + + } + + + + + @DataProvider(name="assemblyResults") + public java.util.Iterator assemblyResults() { + final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length); + final Object[][] result = new Object[size][]; + + for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) { + final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1]; + final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length]; + for (int j = 0; j < haplotypeStrings.length; j++) { + haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j 
== 0); + haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1)); + } + result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))}; + for (int j = 0; j < TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) { + final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]); + final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph()); + ar10.setThreadingGraph(rtg10); + final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1]; + final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length]; + for (int k = 0; k < haplotypeStrings10.length; k++) { + haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false); + haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1)); + } + + result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10), + Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) }; + } + } + return Arrays.asList(result).iterator(); + } + + + private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=3]{REF: ACT}",new Object[] {"ACT"}}, + {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> G -> (2) }" + + "{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}}, + {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} , + {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + + "{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} , + {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + + "{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}", + new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} , + {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> 
T -> A -> C -> G -> T -> A -> T}", + new Object[] {"ACTATCAGTACGTAT"}}, + {"[ks=3]{REF: ACT -> A -> T(1) }" + + "{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}}, + {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] {"TTTACT","TTTAT"}} + }; + + private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}}, + {"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}", + new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}}, + + }; + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java new file mode 100644 index 000000000..57df96475 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -0,0 +1,363 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 3/15/12 + */ + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Unit tests for GenotypingEngine + */ +public class GenotypingEngineUnitTest extends BaseTest { + + private static ReferenceSequenceFile seq; + private GenomeLocParser 
genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + } + + @Test + public void testFindHomVarEventAllelesInSample() { + final List eventAlleles = new ArrayList(); + eventAlleles.add( Allele.create("A", true) ); + eventAlleles.add( Allele.create("C", false) ); + final List haplotypeAlleles = new ArrayList(); + haplotypeAlleles.add( Allele.create("AATA", true) ); + haplotypeAlleles.add( Allele.create("AACA", false) ); + haplotypeAlleles.add( Allele.create("CATA", false) ); + haplotypeAlleles.add( Allele.create("CACA", false) ); + final List haplotypes = new ArrayList(); + haplotypes.add(new Haplotype("AATA".getBytes())); + haplotypes.add(new Haplotype("AACA".getBytes())); + haplotypes.add(new Haplotype("CATA".getBytes())); + haplotypes.add(new Haplotype("CACA".getBytes())); + final List haplotypeAllelesForSample = new ArrayList(); + haplotypeAllelesForSample.add( Allele.create("CATA", false) ); + haplotypeAllelesForSample.add( Allele.create("CACA", false) ); + final List> alleleMapper = new ArrayList>(); + List Aallele = new ArrayList(); + Aallele.add(haplotypes.get(0)); + Aallele.add(haplotypes.get(1)); + List Callele = new ArrayList(); + Callele.add(haplotypes.get(2)); + Callele.add(haplotypes.get(3)); + alleleMapper.add(Aallele); + alleleMapper.add(Callele); + final List eventAllelesForSample = new ArrayList(); + eventAllelesForSample.add( Allele.create("C", false) ); + eventAllelesForSample.add( Allele.create("C", false) ); + + if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + logger.warn("expected alleles = " + 
eventAllelesForSample); + } + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + } + + @Test + public void testFindHetEventAllelesInSample() { + final List eventAlleles = new ArrayList(); + eventAlleles.add( Allele.create("A", true) ); + eventAlleles.add( Allele.create("C", false) ); + eventAlleles.add( Allele.create("T", false) ); + final List haplotypeAlleles = new ArrayList(); + haplotypeAlleles.add( Allele.create("AATA", true) ); + haplotypeAlleles.add( Allele.create("AACA", false) ); + haplotypeAlleles.add( Allele.create("CATA", false) ); + haplotypeAlleles.add( Allele.create("CACA", false) ); + haplotypeAlleles.add( Allele.create("TACA", false) ); + haplotypeAlleles.add( Allele.create("TTCA", false) ); + haplotypeAlleles.add( Allele.create("TTTA", false) ); + final List haplotypes = new ArrayList(); + haplotypes.add(new Haplotype("AATA".getBytes())); + haplotypes.add(new Haplotype("AACA".getBytes())); + haplotypes.add(new Haplotype("CATA".getBytes())); + haplotypes.add(new Haplotype("CACA".getBytes())); + haplotypes.add(new Haplotype("TACA".getBytes())); + haplotypes.add(new Haplotype("TTCA".getBytes())); + haplotypes.add(new Haplotype("TTTA".getBytes())); + final List haplotypeAllelesForSample = new ArrayList(); + haplotypeAllelesForSample.add( Allele.create("TTTA", false) ); + haplotypeAllelesForSample.add( Allele.create("AATA", true) ); + final List> alleleMapper = new ArrayList>(); + List Aallele = new ArrayList(); + Aallele.add(haplotypes.get(0)); + Aallele.add(haplotypes.get(1)); + List Callele = new ArrayList(); + Callele.add(haplotypes.get(2)); + Callele.add(haplotypes.get(3)); + List Tallele = new ArrayList(); + Tallele.add(haplotypes.get(4)); + Tallele.add(haplotypes.get(5)); + Tallele.add(haplotypes.get(6)); + alleleMapper.add(Aallele); + alleleMapper.add(Callele); + alleleMapper.add(Tallele); + final List 
eventAllelesForSample = new ArrayList(); + eventAllelesForSample.add( Allele.create("A", true) ); + eventAllelesForSample.add( Allele.create("T", false) ); + + if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { + logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); + logger.warn("expected alleles = " + eventAllelesForSample); + } + Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); + } + + private boolean compareAlleleLists(List l1, List l2) { + if( l1.size() != l2.size() ) { + return false; // sanity check + } + + for( int i=0; i < l1.size(); i++ ){ + if ( !l2.contains(l1.get(i)) ) + return false; + } + return true; + } + + + private class BasicGenotypingTestProvider extends TestDataProvider { + byte[] ref; + byte[] hap; + Map expected; + + public BasicGenotypingTestProvider(String refString, String hapString, Map expected) { + super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); + ref = refString.getBytes(); + hap = hapString.getBytes(); + this.expected = expected; + } + + public Map calcAlignment() { + final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); + final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); + return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); + } + } + + @DataProvider(name = "BasicGenotypingTestProvider") + public Object[][] makeBasicGenotypingTests() { + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, 
(byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(2 + contextSize, (byte)'M'); + map.put(21 + contextSize, (byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'I'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1, (byte)'M'); + map.put(20, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); + } + + for( int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(2 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'I'); + map.put(30 + contextSize, (byte)'D'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map); + } + + for( 
int contextSize : new int[]{0,1,5,9,24,36} ) { + Map map = new HashMap(); + map.put(1 + contextSize, (byte)'M'); + map.put(20 + contextSize, (byte)'D'); + map.put(28 + contextSize, (byte)'M'); + final String context = Utils.dupString('G', contextSize); + new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map); + } + + return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class); + } + + @Test(dataProvider = "BasicGenotypingTestProvider", enabled = true) + public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) { + Map calculatedMap = cfg.calcAlignment(); + Map expectedMap = cfg.expected; + logger.warn(String.format("Test: %s", cfg.toString())); + if(!compareVCMaps(calculatedMap, expectedMap)) { + logger.warn("calc map = " + calculatedMap); + logger.warn("expected map = " + expectedMap); + } + Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap)); + } + + @Test(dataProvider="AddMiscellaneousDataProvider", enabled=false) + public void testAddMiscellaneousAllele(final String readBases, final int readOffset, + final String ref, final int refOffset, + final String referenceAllele, final String[] alternatives, final double[] likelihoods, final double[] expected) { + final byte baseQual = (byte)30; + + final byte[] baseQuals = Utils.dupBytes(baseQual, readBases.length()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), baseQuals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20",0,refOffset,refOffset); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc,Collections.singletonList(read),readOffset); + final VariantContextBuilder vcb = new VariantContextBuilder(); + final GenotypeBuilder gb = new GenotypeBuilder(); + final List alleleStrings = new ArrayList<>( 1 + alternatives.length); + alleleStrings.add(referenceAllele); + alleleStrings.addAll(Arrays.asList(alternatives)); + + 
gb.AD(new int[] { 1 }); + gb.DP(1); + gb.PL(likelihoods); + + vcb.alleles(alleleStrings); + vcb.loc("20",refOffset,refOffset + referenceAllele.length() -1); + + vcb.genotypes(gb.make()); + + final VariantContext vc = vcb.make(); + + final VariantContext updatedVc = null; // GenotypingEngine.addMiscellaneousAllele(vc,pileup,ref.getBytes(),0); + final GenotypeLikelihoods updatedLikelihoods = updatedVc.getGenotype(0).getLikelihoods(); + Assert.assertEquals(updatedLikelihoods.getAsVector().length, expected.length); + final double[] updatedLikelihoodsArray = updatedVc.getGenotype(0).getLikelihoods().getAsVector(); + for (int i = 0; i < updatedLikelihoodsArray.length; i++) { + Assert.assertEquals(updatedLikelihoodsArray[i],expected[i],0.0001); + } + Allele altAllele = null; + for (final Allele allele : updatedVc.getAlleles()) + if (allele.isSymbolic() && allele.getBaseString().equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME)) + altAllele = allele; + Assert.assertNotNull(altAllele); + } + + @DataProvider(name="AddMiscellaneousDataProvider") + public Iterator addMiscellaneousAlleleDataProvider() { + return Arrays.asList(ADD_MISCELLANEOUS_ALLELE_DATA).iterator(); + } + + private static final double MATCH_LnLK = QualityUtils.qualToProbLog10((byte)30); + private static final double MISS_LnLK = QualityUtils.qualToErrorProbLog10((byte)30); + + private static final Object[][] ADD_MISCELLANEOUS_ALLELE_DATA = new Object[][] { + new Object[] {"ACTG", 0,"ACTGTGAGTATTCC",0,"A",new String[]{}, new double[] {MATCH_LnLK * MATCH_LnLK}, 6, + new double[] {MATCH_LnLK * MATCH_LnLK,MATCH_LnLK * MISS_LnLK, MISS_LnLK * MISS_LnLK}} + }; + + /** + * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext + */ + private boolean compareVCMaps(Map calc, Map expected) { + if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check + for( Integer loc : expected.keySet() ) { + Byte type = expected.get(loc); + switch( 
type ) { + case 'I': + if( !calc.get(loc).isSimpleInsertion() ) { return false; } + break; + case 'D': + if( !calc.get(loc).isSimpleDeletion() ) { return false; } + break; + case 'M': + if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; } + break; + default: + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java new file mode 100644 index 000000000..f547e12cc --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM; +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF; + +public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { + + private void HCTestComplexVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleComplex1() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "65c316f1f3987d7bc94e887999920d45"); + } + + private void HCTestSymbolicVariants(String 
bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); + } + + // TODO -- need a better symbolic allele test + @Test + public void testHaplotypeCallerSingleSampleSymbolic() { + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); + } + + private void HCTestComplexGGA(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAComplex() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", + "724a05b7df716647014f29c0fe86e071"); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", + "f50e0b35e2240b19b1b8b6dfa0cf9796"); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java new file mode 100644 index 000000000..8ca67f31d --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -0,0 +1,156 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { + @DataProvider(name = "MyDataProvider") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList<>(); + + final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; + final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, 
"50323a284788c8220c9226037c7003b5"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7c16aa8e35de9f418533efac3bae6551"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "7e1e193d70187774f9740d475e0f1cc1"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProvider") + public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); + final Pair,List> executionOutput = executeTest(name, spec); + } + + @Test + public void testERCRegionWithNoCalledHaplotypes() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, 
HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testERCRegionWithNoCalledHaplotypes", spec); + } + + @Test() + public void testMissingGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + @Test() + public void testWrongParameterGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + @Test() + public void testWrongTypeGVCFIndexException() { + // ensure non-optimal, if optimal changes + GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; + if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) + type = GATKVCFIndexType.DYNAMIC_SIZE; + + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, 
UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + private final static String WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS = privateTestDir + "gvcf_unsorted_records_bug.interval_list"; + private final static String WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM = privateTestDir + "gvcf_unsorted_records_bug.bam"; + + @Test() + public void testWrongGVCFNonVariantRecordOrderBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("324eb46738a364cd7dc5fa0b62491a5e")); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + private static final String NOCALL_GVCF_BUGFIX_INTERVALS = privateTestDir + "gvcf_nocall_bug.interval_list"; + private static final String NOCALL_GVCF_BUGFIX_BAM = privateTestDir + "gvcf_nocall_bug.bam"; + + @Test + public void testNoCallGVCFMissingPLsBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("4e2c20650c4c5ae6fa44b289eae5771d")); + spec.disableShadowBCF(); + executeTest("testNoCallGVCFMissingPLsBugFix", spec); + } +} diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java new file mode 100644 index 000000000..615c62c43 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -0,0 +1,323 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; + +public class HaplotypeCallerIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final static String NA12878_BAM = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; + final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam"; + final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; + final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; + + private void HCTest(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering 
--pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCaller: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSample() { + HCTest(CEUTRIO_BAM, "", "489073bf0034fe9f10e6472ab93a17eb"); + } + + @Test + public void testHaplotypeCallerSingleSample() { + HCTest(NA12878_BAM, "", "96f299a5cf411900b8eda3845c3ce465"); + } + + @Test + public void testHaplotypeCallerMinBaseQuality() { + HCTest(NA12878_BAM, "-mbq 15", "6509cfd0554ecbb92a1b303fbcc0fcca"); + } + + @Test + public void testHaplotypeCallerGraphBasedSingleSample() { + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "83fe0694621bc1e0240f6f79eb6d6999"); + } + + @Test + public void testHaplotypeCallerGraphBasedMultiSample() { + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "d45b2b26434dd3bd48df5a43b3d2954a"); + } + + @Test(enabled = false) // can't annotate the rsID's yet + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + + @Test + public void testHaplotypeCallerMultiSampleGGA() { + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "a1e59313516c2d5eeedae8348b0bdff1"); + } + + @Test + public void testHaplotypeCallerInsertionOnEdgeOfContig() { + HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); + } + + private void HCTestIndelQualityScores(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + 
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleIndelQualityScores() { + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "d3fc49d3d3c8b6439548133e03faa540"); + } + + private void HCTestNearbySmallIntervals(String bam, String args, String md5) { + try { + final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); + final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); + + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { + if( containsDuplicateRecord(vcf, parser) ) { + throw new IllegalStateException("Duplicate records detected but there should be none."); + } + } + } catch( FileNotFoundException e ) { + throw new IllegalStateException("Could not find the b37 reference file."); + } + } + + private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { + final List> VCs = new ArrayList<>(); + try { + for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + } + } catch( IOException e ) { + throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); + } + + final Set> VCsAsSet = new HashSet<>(VCs); + return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
+ } + + + @Test + public void testHaplotypeCallerNearbySmallIntervals() { + HCTestNearbySmallIntervals(NA12878_BAM, "", "a415bc76231a04dc38412ff38aa0dc49"); + } + + // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper + // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to + // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust + // any of the calls in that region because it is so messy. + @Test + public void HCTestProblematicReadsModifiedInActiveRegions() { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("763d4d8d84a4080db18235a413478660")); + executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); + } + + @Test + public void HCTestStructuralIndels() { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); + executeTest("HCTestStructuralIndels: ", spec); + } + + @Test + public void HCTestDoesNotFailOnBadRefBase() { + // don't care about the output - just want to make sure it doesn't fail + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); + 
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); + } + + @Test + public void HCTestDanglingTailMergingForDeletions() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); + + // confirm that the call is the correct one + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertTrue(vc.getReference().basesMatch("ATGTATG")); + Assert.assertTrue(vc.getAlternateAllele(0).basesMatch("A")); + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("12c56262ed30db1249b8d722e324357c")); + executeTest("HC calling on a ReducedRead BAM", spec); + } + + @Test + public void testReducedBamWithReadsNotFullySpanningDeletion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T 
HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + Arrays.asList("1627cf5f3a97e8b73b3c095db46aef1b")); + executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // test dbSNP annotation + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestDBSNPAnnotationWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("0864904254b2fa757991f8c2dac4932d")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWEx() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("e39c73bbaf22b4751755d9f5bb2a8d3d")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWGSGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + 
Arrays.asList("df1f9410d23a550a143531ac0891f1dc")); + executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWExGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("c14d7f23dedea7e7ec99a90843320c1a")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestGraphBasedPCRFreePositiveLogLkFix() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 " + , 1, + Arrays.asList("")); + executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // test PCR indel model + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestAggressivePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("f426f4c2986e1dea8f3f55951ef8e013")); + executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); + } + + @Test + public void HCTestConservativePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T 
HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("616cc63d5a78765145914457dec475b0")); + executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); + } + + @Test + public void testNoSuchEdgeBugFix() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -dontTrimActiveRegions -ERC GVCF " + + "-likelihoodEngine GraphBased -variant_index_type %s -variant_index_parameter %d", + b37KGReferenceWithDecoy, privateTestDir + "graphbased_no_such_edge_bug.bam", privateTestDir + "graphbased_no_such_edge_bug.intervals.bed", + HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testGraphBasedNoSuchEdgeBugFix", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java new file mode 100644 index 000000000..23513f314 --- /dev/null +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -0,0 +1,79 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { + @DataProvider(name = "NCTDataProvider") + public Object[][] makeNCTDataProvider() { + List tests = new ArrayList<>(); + + for ( final int nct : Arrays.asList(1, 2, 4) ) { + tests.add(new Object[]{nct, "1f463bf3a06c401006858bc446ecea54"}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "NCTDataProvider") + public void testHCNCT(final int nct, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + + " -L 20:10,000,000-10,100,000 -G 
none -A -contamination 0.0 -nct " + nct, 1, + Arrays.asList(md5)); + executeTest("HC test parallel HC with NCT with nct " + nct, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrectorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java new file mode 100644 index 000000000..7d218c19c --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -0,0 +1,420 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class ReferenceConfidenceModelUnitTest extends BaseTest { + GenomeLocParser parser; + final String RGID = "ID1"; + GATKSAMReadGroupRecord rg; + final String sample = "NA12878"; + final Set samples = 
Collections.singleton(sample); + SAMFileHeader header; + ReferenceConfidenceModel model; + + @BeforeClass + public void setUp() throws Exception { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + rg = new GATKSAMReadGroupRecord(RGID); + rg.setSample(sample); + header.addReadGroup(rg); + parser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @BeforeMethod + public void setupModel() { + model = new ReferenceConfidenceModel(parser, samples, header, 10); + } + + @DataProvider(name = "CalcNIndelInformativeReadsData") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList<>(); + + { // very basic testing + final String ref = "ACGT"; + final String read = "ACGT"; + tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)}); + tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)}); + tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)}); + tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)}); + } + + { // actually interesting case where some sites aren't informative + final String ref = "NNAAAANN"; + final String read1 = "NNA"; + final String read2 = "NNAA"; + final String read3 = "NNAAA"; + final String read4 = "NNAAAA"; + final String read5 = "NNAAAAN"; + tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)}); + } + + { + for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) { + final String anchor = Utils.dupString("N", repeatUnit.length()); + for ( int nUnits = 1; nUnits < 10; nUnits++ ) { + final String repeat = Utils.dupString(repeatUnit, nUnits); + final String ref = anchor + repeat + anchor; + for ( int 
readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) { + final String read = anchor + repeat.substring(0, readLen); + final List expected = new LinkedList<>(); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(1); + for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() ? 1 : 0); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(0); + tests.add(new Object[]{read, ref, repeatUnit.length(), expected}); + + final List result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1)); + result.addAll(Collections.nCopies(anchor.length(), 0)); + tests.add(new Object[]{ref, ref, repeatUnit.length(), result}); + } + } + + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CalcNIndelInformativeReadsData") + public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List expected ) { + final byte qual = (byte)30; + final byte[] quals = Utils.dupBytes(qual, readBases.length()); + + for ( int i = 0; i < readBases.getBytes().length; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i); + final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize); + Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i); + } + } + + @Test + public void testCalcNIndelInformativeReducedReads() { + final String bases = "ACGGGTTTGGAC"; + final byte[] quals = Utils.dupBytes((byte)30, bases.length()); + final int count = 10; + final int[] counts = new int[bases.length()]; + for ( int i = 0; i < counts.length; i++ ) + counts[i] = count; + final int position = 100; + + final GATKSAMRecord read = 
ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts); + read.setReadString(bases); + read.setBaseQualities(quals); + read.setCigarString(bases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), 0); + final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3); + Assert.assertEquals(actual, count); + } + + @Test + public void testClose() { + model.close(); + } + + @Test + public void testWorstGL() { + final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100"); + final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200"); + final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200"); + + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0); + Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0); + } + + @Test + public void testIndelLikelihoods() { + GenotypeLikelihoods prev = model.getIndelPLs(0); + Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); + Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); + + for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { + final GenotypeLikelihoods current = model.getIndelPLs(i); + final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); + final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); + Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " 
+ i); +// logger.warn("result at " + i + " is " + current); + prev = current; + } + } + + @Test + public void testOverlappingVariantContext() { + final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C")); + final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C")); + final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A")); + final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT")); + + final List calls = Arrays.asList(vc13, vc12_15, vc18, vc10); + + checkOverlapping(8, calls, null); + checkOverlapping(9, calls, null); + checkOverlapping(10, calls, vc10); + checkOverlapping(11, calls, null); + checkOverlapping(12, calls, vc12_15); + checkOverlapping(13, calls, vc13); + checkOverlapping(14, calls, vc12_15); + checkOverlapping(15, calls, vc12_15); + checkOverlapping(16, calls, null); + checkOverlapping(17, calls, null); + checkOverlapping(18, calls, vc18); + checkOverlapping(19, calls, null); + checkOverlapping(20, calls, null); + } + + private void checkOverlapping(final int pos, Collection calls, final VariantContext expected) { + final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos); + final VariantContext actual = model.getOverlappingVariantContext(loc, calls); + Assert.assertEquals(actual, expected); + } + + // + // test reference calculation + // + private class RefConfData { + final String ref; + final int extension; + final Haplotype refHap; + final GenomeLoc refLoc, paddedRefLoc; + final ActiveRegion region; + int readCounter = 0; + + private RefConfData(String ref, int extension) { + this.ref = ref; + this.extension = extension; + + refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd()); + paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, 
getEnd() + extension); + region = new ActiveRegion(getRefLoc(), parser, extension); + final String pad = Utils.dupString("N", extension); + refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc()); + } + + public GenomeLoc getRefLoc() { return refLoc; } + public GenomeLoc getPaddedRefLoc() { return paddedRefLoc; } + public ActiveRegion getActiveRegion() { return region; } + public Haplotype getRefHap() { return refHap; } + public int getStart() { return 100; } + public int getEnd() { return getStart() + getRefLength() - 1; } + public byte[] getRefBases() { return ref.getBytes(); } + public int getRefLength() { return ref.length(); } + + public GATKSAMRecord makeRead(final int start, final int length) { + final byte[] quals = Utils.dupBytes((byte)30, length); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M"); + read.setReadGroup(rg); + return read; + } + } + + + @DataProvider(name = "RefConfidenceData") + public Object[][] makeRefConfidenceData() { + List tests = new ArrayList<>(); + + for ( int i = 0; i < 10; i++ ) { + for ( final int extension : Arrays.asList(0, 10) ) { + tests.add(new Object[]{i, extension}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RefConfidenceData") + public void testRefConfidenceBasic(final int nReads, final int extension) { + final RefConfData data = new RefConfData("ACGTAACCGGTT", extension); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = 
Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + + @Test + public void testRefConfidencePartialReads() { + final String ref = "ACGTAACCGGTT"; + for ( int readLen = 3; readLen < ref.length(); readLen++ ) { + for ( int start = 0; start < ref.length() - readLen; start++ ) { + final RefConfData data = new RefConfData(ref, 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + data.getActiveRegion().add(data.makeRead(start, readLen)); + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); + for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + } + } + + @Test + public void testRefConfidenceWithCalls() { + final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0); + final int start = xxxdata.getStart(); + final int stop = xxxdata.getEnd(); + + for ( int nReads = 0; nReads < 2; nReads++ ) { + + final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C")); + final VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C")); + final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C")); + final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", 
"chr1", start + 4, Arrays.asList("ACG", "A")); + final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG")); + + final List allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns); + + for ( int n = 1; n <= allCalls.size(); n++ ) { + for ( final List calls : Utils.makePermutations(allCalls, n, false) ) { +// logger.warn("Executing " + n + " " + calls.size()); + final RefConfData data = new RefConfData("ACGTAACCGGTT", 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + } + } + } + + private void checkReferenceModelResult(final RefConfData data, final List contexts, final List expectedDPs, final List calls) { + Assert.assertNotNull(contexts); + + final GenomeLoc loc = data.getActiveRegion().getExtendedLoc(); + final List seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false)); + + for ( int i = 0; i < loc.size(); i++ ) { + final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i); + final VariantContext call = model.getOverlappingVariantContext(curPos, calls); + final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts); + + if ( ! 
data.getActiveRegion().getLocation().containsP(curPos) ) { + // part of the extended interval, but not the full interval + Assert.assertNull(refModel); + continue; + } + + if ( call != null ) { + Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); + } else { + final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart()); + Assert.assertEquals(refModel.getStart(), loc.getStart() + i); + Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); + Assert.assertFalse(refModel.hasLog10PError()); + Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); + Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertTrue(refModel.hasGenotype(sample)); + + final Genotype g = refModel.getGenotype(sample); + Assert.assertTrue(g.hasAD()); + Assert.assertTrue(g.hasDP()); + Assert.assertEquals(g.getDP(), expectedDP); + Assert.assertTrue(g.hasGQ()); + Assert.assertTrue(g.hasPL()); + } + + final VariantContext vc = call == null ? 
refModel : call; + if ( curPos.getStart() == vc.getStart() ) { + for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) { + final int j = pos - data.getActiveRegion().getLocation().getStart(); + Assert.assertFalse(seenBP.get(j)); + seenBP.set(j, true); + } + } + } + + for ( int i = 0; i < seenBP.size(); i++ ) { + Assert.assertEquals((boolean)seenBP.get(i), true); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtilsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java new file mode 100644 index 000000000..aeb617b18 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.jgrapht.EdgeFactory; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 9/5/13 + * Time: 11:04 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class RouteUnitTest extends BaseTest { + + @Test(dataProvider="slicePrefixTestData") + public void testSplicePrefix(final Route route) { + final int routeLength = route.length(); + for (int i = 0; i < routeLength; i++) { + final Route spliced = route.splicePrefix(i); + Assert.assertEquals(spliced.length(),route.length() - i); + final List routeEdges = route.getEdges(); + final List expectedSlicedEdges = routeEdges.subList(i,routeLength); + Assert.assertEquals(spliced.getEdges(),expectedSlicedEdges); + } + } + + @Test(dataProvider="isSuffixTestData") + public void testIsSuffix(final Route route, final Path path, final boolean expectedResult) { + Assert.assertEquals(route.isSuffix(path), expectedResult); + } + + @DataProvider(name="isSuffixTestData") + public Iterator isSuffixTestData() { + return IS_SUFFIX_TEST_DATA.iterator(); + } + + @DataProvider(name="slicePrefixTestData") + public Iterator slicePrefixTestData() { + return Arrays.asList(SLICE_PREFIX_TEST_DATA).iterator(); + } + + private static final int[] TEST_EDGE_PAIRS1 = new int[] { + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 11, + 11, 12, + }; + + private static final int[] TEST_EDGE_PAIRS = new int[] { + 1 , 2, + 2 , 3, + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 10, + 10, 11, + 11, 12, + 2, 5, + 5, 12, + + 3, 13, + 13, 14, + 14, 15 + }; + + public static final EdgeFactory TEST_GRAPH_EDGE_FACTORY = new EdgeFactory() { + @Override + public BaseEdge createEdge(final BaseVertex baseVertex, final BaseVertex baseVertex2) { + return new BaseEdge(false, 0); + } + }; + + + private static Map vertexByInteger = new HashMap<>(); + private static final BaseGraph TEST_GRAPH = new BaseGraph<>(1, TEST_GRAPH_EDGE_FACTORY); + private static final List IS_SUFFIX_TEST_DATA; + + private static final Object[][] SLICE_PREFIX_TEST_DATA; + + static { + for (int i = 0; i < TEST_EDGE_PAIRS.length; i += 2) { + final int sourceInteger = TEST_EDGE_PAIRS[i]; + final int targetInteger 
= TEST_EDGE_PAIRS[i + 1]; + final BaseVertex sourceVertex = resolveVertexByInteger(sourceInteger); + final BaseVertex targetVertex = resolveVertexByInteger(targetInteger); + TEST_GRAPH.addEdge(sourceVertex, targetVertex); + } + Assert.assertEquals(1,TEST_GRAPH.getSources().size()); + final Deque> pendingPaths = new LinkedList<>(); + final Deque> pendingRoutes = new LinkedList<>(); + final List> allPossiblePaths = new LinkedList<>(); + final List> allPossibleRoutes = new LinkedList<>(); + for (final BaseVertex vertex : TEST_GRAPH.vertexSet()) { + pendingPaths.add(new Path(vertex, TEST_GRAPH)); + pendingRoutes.add(new Route(vertex,TEST_GRAPH)); + } + while (!pendingPaths.isEmpty()) { // !pendingRoutes.isEmpty(); + final Path path = pendingPaths.remove(); + final Route route = pendingRoutes.remove(); + final BaseVertex lastVertex = path.getLastVertex(); + allPossiblePaths.add(path); + allPossibleRoutes.add(route); + + if (allPossiblePaths.size() % 100 == 0) + Reporter.log("" + allPossiblePaths.size(), true); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingPaths.add(new Path<>(path,edge)); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingRoutes.add(new Route<>(route,edge)); + } + + final int numberOfPaths = allPossiblePaths.size(); + final boolean[][] isSuffix = buildIsSuffixMatrix(allPossiblePaths, numberOfPaths); + IS_SUFFIX_TEST_DATA = createTestData(allPossiblePaths,allPossibleRoutes,isSuffix); + SLICE_PREFIX_TEST_DATA = createSlicePrefixTestData(allPossibleRoutes); + } + + private static Object[][] createSlicePrefixTestData(List> allPossibleRoutes) { + final Object[][] result = new Object[allPossibleRoutes.size()][1]; + final Object[] routes = allPossibleRoutes.toArray(); + for (int i = 0; i < result.length; i++) + result[i][0] = routes[i]; + return result; + } + + private static boolean[][] buildIsSuffixMatrix(final List> allPossiblePaths, final int numberOfPaths) { + final boolean[][] isSuffix = new 
boolean[numberOfPaths][numberOfPaths]; + final ListIterator> iIterator = allPossiblePaths.listIterator(); + for (int i = 0; i < numberOfPaths; i++) { + isSuffix[i][i] = true; + final ListIterator> jIterator = allPossiblePaths.listIterator(i + 1); + final Path iPath = iIterator.next(); + for (int j = i + 1; j < numberOfPaths; j++) { + final Path jPath = jIterator.next(); + if (iPath.getLastVertex() != jPath.getLastVertex()) { + isSuffix[i][j] = isSuffix[j][i] = false; + } else { + isSuffix[i][j] = isSuffix[j][i] = true; // let assume they are suffix of each other by default. + final Path shortPath; + final Path longPath; + if (iPath.getEdges().size() <= jPath.getEdges().size()) { + shortPath = iPath; + longPath = jPath; + } else { + longPath = iPath; + shortPath = jPath; + } + final ListIterator longPathEdgesIterator = longPath.getEdges().listIterator(longPath.getEdges().size()); + final ListIterator shortPathEdgesIterator = shortPath.getEdges().listIterator(shortPath.getEdges().size()); + + while (shortPathEdgesIterator.hasPrevious()) { + final BaseEdge shortEdge = shortPathEdgesIterator.previous(); + final BaseEdge longEdge = longPathEdgesIterator.previous(); + if (shortEdge != longEdge) { + isSuffix[i][j] = isSuffix[j][i] = false; + break; + } + } + if (isSuffix[i][j]) { + if (longPathEdgesIterator.hasPrevious()) { + if (longPath == iPath) + isSuffix[j][i] = false; + else + isSuffix[i][j] = false; + } + } + } + + } + } + return isSuffix; + } + + private static List createTestData(final List> allPossiblePaths, final List> allPossibleRoutes, final boolean[][] isSuffix) { + final List result = new ArrayList<>(allPossiblePaths.size() * allPossiblePaths.size() * 2 ); + final Path[] allPaths = allPossiblePaths.toArray(new Path[allPossiblePaths.size()]); + final Route[] allRoutes = allPossibleRoutes.toArray(new Route[allPossibleRoutes.size()]); + final int numberOfPaths = allPaths.length; + for (int i = 0; i < numberOfPaths; i++) + for (int j = 0; j < numberOfPaths; 
j++) { + result.add(new Object[] { allRoutes[i], allPaths[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], allRoutes[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], inverseRebuild(allRoutes[j]), isSuffix[i][j]}); + } + + return result; + } + + private static Route inverseRebuild(final Route original) { + final ListIterator it = original.getEdges().listIterator(original.length()); + Route result = new Route<>(original.getLastVertex(),original.getGraph()); + while (it.hasPrevious()) { + result = new Route<>(it.previous(),result); + } + return result; + } + + private static BaseVertex resolveVertexByInteger(final int targetInteger) { + if (vertexByInteger.containsKey(targetInteger)) + return vertexByInteger.get(targetInteger); + else { + int value = targetInteger; + final StringBuffer stringBuffer = new StringBuffer(); + while (value > 0) { + int c = value % 4; + switch (c) { + case 0: stringBuffer.append('A'); break; + case 1: stringBuffer.append('C'); break; + case 2: stringBuffer.append('G'); break; + case 3: stringBuffer.append('T'); break; + } + value = value / 4; + } + if (stringBuffer.length() == 0) stringBuffer.append('A'); + final byte[] sequence = stringBuffer.reverse().toString().getBytes(); + final BaseVertex result = new BaseVertex(sequence); + vertexByInteger.put(targetInteger, result); + TEST_GRAPH.addVertex(result); + return result; + } + + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManagerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java new file mode 100644 index 000000000..509bf7465 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java @@ -0,0 +1,82 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +public class IndelRealignerUnitTest extends BaseTest { + + private SAMFileHeader header; + + @BeforeClass + public void setup() throws FileNotFoundException { + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + } + + @Test + public void realignAtContigBorderTest() { + final int contigEnd = header.getSequence(0).getSequenceLength(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "goodRead", 0, contigEnd - 1, 2); + read.setCigarString("2M"); + Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), false); + read.setCigarString("1M1D1M"); + Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), true); + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java new file mode 100644 index 000000000..3480b6775 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -0,0 +1,156 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.indels; + + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +public class PairHMMIndelErrorModelUnitTest extends BaseTest { + + private SAMFileHeader header; + + @BeforeClass + public void setup() throws FileNotFoundException { + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + } + + private static final int refWindowStart = 1000; + private static final int refWindowEnd = 1100; + + 
@DataProvider(name = "ClipUpstreamProvider") + public Object[][] ClipUpstreamTestData() { + List tests = new ArrayList(); + + for ( final int readStart : Arrays.asList(900, 950, 990, 1000) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClipUpstreamProvider", enabled = true) + public void clipUpstreamTest(final int readStart, final int readLength, final int delLength) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); + + final boolean result = PairHMMIndelErrorModel.mustClipUpstream(read, refWindowStart); + Assert.assertEquals(result, read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart); + } + + @DataProvider(name = "ClipDownstreamProvider") + public Object[][] ClipDownstreamTestData() { + List tests = new ArrayList(); + + for ( final int readStart : Arrays.asList(1000, 1050, 1090, 1100) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ClipDownstreamProvider", enabled = true) + public void clipDownstreamTest(final int readStart, final int readLength, final int delLength) { + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); + + final boolean result = 
PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); + Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd); + } + + @Test + public void clipDownstreamAtBorderTest() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 5, 10); + read.setCigarString("10M"); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 13), true); + Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 14), false); + } + + @Test + public void trimHaplotypesToNullAlleleTest() { + // we need a case where start and stop > haplotype coordinates + final int start = 100, stop = 100; + final Haplotype h = new Haplotype(new byte[]{(byte)'A'}, new UnvalidatingGenomeLoc("1", 0, 10, 10)); + final Map input = new HashMap(1); + input.put(Allele.create("A"), h); + + final Map output = PairHMMIndelErrorModel.trimHaplotypes(input, start, stop, null); + Assert.assertTrue(output.isEmpty()); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/ReadBinUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java new file mode 100644 index 000000000..8c8817fe6 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -0,0 +1,139 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class ReadBackedPhasingIntegrationTest extends WalkerTest { + + public static String baseTestString(String reference, String reads, String VCF, int cacheWindowSize, int maxPhaseSites, double phaseQualityThresh) { + return "-T ReadBackedPhasing" + + " -R " + reference + + " -I " + validationDataLocation + reads + + " --variant " + ( VCF.contains("phasing_test") ? 
privateTestDir : validationDataLocation) + VCF + + " --cacheWindowSize " + cacheWindowSize + + " --maxPhaseSites " + maxPhaseSites + + " --phaseQualityThresh " + phaseQualityThresh + + " -o %s" + + " --no_cmdline_in_header"; + } + + + @Test + public void test1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:332341-382503", + 1, + Arrays.asList("1bb034bd54421fe4884e3142ed92d47e")); + executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); + } + + @Test + public void test2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:1232503-1332503", + 1, + Arrays.asList("c12954252d4c8659b5ecf7517b277496")); + executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); + } + + @Test + public void test3() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + + " -L chr20:332341-382503", + 1, + Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5")); + executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); + } + + @Test + public void test4() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + + " -L chr20:332341-382503", + 1, + Arrays.asList("e9e8ef92d694ca71f29737fba26282f5")); + executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); + } + + @Test + public void test5() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + + " -L chr20:332341-482503", + 1, + 
Arrays.asList("b9c9347c760a06db635952bf4920fb48")); + executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); + } + + @Test + public void test6() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + + " -L chr20:652810-681757", + 1, + Arrays.asList("02c3a903842aa035ae379f16bc3d64ae")); + executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); + } + + @Test + public void test7() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + + " -L chr20:332341-802503", + 1, + Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); + executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java new file mode 100644 index 000000000..2ae904e65 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -0,0 +1,95 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.simulatereads; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { + + @Test + public void testDefaults() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("dd9e17a9c268578e903ecd4ca0a4a335")); + executeTest("testVariants", spec); + } + + @Test + public void testReadLength() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RL 70 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("d7388376ffd4d3826d48a5be0be70632")); + executeTest("testReadLength", spec); + } + + @Test + public void testErrorRate() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-ER 40 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("6c9bf583f4b2708d6b82f54516474b7b")); + executeTest("testErrorRate", spec); + } + + @Test + public void testPlatformTag() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RGPL SOLID -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s", + 1, + Arrays.asList("26db391f223ead74d786006a502029d8")); + executeTest("testPlatformTag", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java similarity index 100% rename 
from protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java new file mode 100644 index 000000000..c4de50b25 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -0,0 +1,329 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { + private static class VRTest { + String inVCF; + String aggregateVCF; + String tranchesMD5; + String recalMD5; + String cutVCFMD5; + + public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { + this.inVCF = inVCF; + this.tranchesMD5 = tranchesMD5; + this.recalMD5 = recalMD5; + this.cutVCFMD5 = cutVCFMD5; + } + + public VRTest(String inVCF, String aggregateVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { + this.inVCF = inVCF; + this.aggregateVCF = aggregateVCF; + this.tranchesMD5 = tranchesMD5; + this.recalMD5 = recalMD5; + this.cutVCFMD5 = cutVCFMD5; + } + + @Override + public String toString() { + return "VRTest{inVCF='" + inVCF +"'}"; + } + } + + VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", + "41e2d951a17de433fe378bb3d9ec75d4", // tranches + "04336b2453202f286da05b69e57f66ed", // recal file + "d29fd0bdc1c8c3a171e10d29f7ffeaec"); // cut VCF + + VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", + "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches + "1b33c10be7d8bf8e9accd11113835262", // recal file + "4700d52a06f2ef3a5882719b86911e51"); // cut VCF + + @DataProvider(name = "VRTest") + public Object[][] createData1() { + return new 
Object[][]{ {lowPass} }; + } + + @DataProvider(name = "VRAggregateTest") + public Object[][] createData2() { + return new Object[][]{ {lowPassPlusExomes} }; + } + + @Test(dataProvider = "VRTest") + public void testVariantRecalibrator(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an HaplotypeScore -an HRun" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator") + public void testApplyRecalibration(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + executeTest("testApplyRecalibration-"+params.inVCF, spec); + } + + @Test(dataProvider = "VRAggregateTest") + public void testVariantRecalibratorAggregate(VRTest params) { + 
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -aggregate " + params.aggregateVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an HaplotypeScore -an MQ" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorAggregate-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRAggregateTest",dependsOnMethods="testVariantRecalibratorAggregate") + public void testApplyRecalibrationAggregate(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + executeTest("testApplyRecalibrationAggregate-"+params.inVCF, spec); + } + + VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", + "3ad7f55fb3b072f373cbce0b32b66df4", // tranches + "e747c08131d58d9a4800720f6ca80e0c", // recal file + "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF + + @DataProvider(name = "VRBCFTest") + public Object[][] 
createVRBCFTest() { + return new Object[][]{ {bcfTest} }; + } + + @Test(dataProvider = "VRBCFTest") + public void testVariantRecalibratorWithBCF(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:10,000,000-20,000,000" + + " --no_cmdline_in_header" + + " -an AC " + // integer value + " -an QD -an ReadPosRankSum -an FS -an InbreedingCoeff " + // floats value + " -mG 2 "+ + " -recalFile %s" + + " -tranchesFile %s", + 2, + Arrays.asList("bcf", "txt"), + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRBCFTest", dependsOnMethods="testVariantRecalibratorWithBCF") + public void testApplyRecalibrationWithBCF(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:10,000,000-20,000,000" + + " --no_cmdline_in_header" + + " -input " + params.inVCF + + " -U LENIENT_VCF_PROCESSING -o %s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); + executeTest("testApplyRecalibration-"+params.inVCF, spec); + } + + + VRTest indelUnfiltered = new VRTest( + validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . 
+ "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF + + VRTest indelFiltered = new VRTest( + validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS + "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF + + @DataProvider(name = "VRIndelTest") + public Object[][] createTestVariantRecalibratorIndel() { + return new Object[][]{ {indelUnfiltered}, {indelFiltered} }; + } + + @Test(dataProvider = "VRIndelTest") + public void testVariantRecalibratorIndel(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " --no_cmdline_in_header" + + " -an QD -an ReadPosRankSum -an HaplotypeScore" + + " -mode INDEL -mG 3" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") + public void testApplyRecalibrationIndel(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " -mode INDEL" + + " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + + " -input " + params.inVCF + + " -o 
%s" + + " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation + executeTest("testApplyRecalibrationIndel-" + params.inVCF, spec); + } + + @Test + public void testApplyRecalibrationSnpAndIndelTogether() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", + Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); + executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); + } + + @Test(enabled = true) + public void testApplyRecalibrationSnpAndIndelTogetherExcludeFiltered() throws Exception { + final String base = "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " --excludeFiltered -ts_filter_level 90.0" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal"; + + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File VCF = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).first.get(0); + + for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { + if( VC != null ) { + Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file + } + } + } +} + diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java new file mode 100644 index 000000000..14341e401 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java @@ -0,0 +1,69 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class CalculateGenotypePosteriorsIntegrationTest extends WalkerTest { + + @Test(enabled = true) + public void testUsingDiscoveredAF() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CalculateGenotypePosteriors --no_cmdline_in_header" + + " -o %s" + + " -R " + b37KGReference + + " -L 20:10,000,000-10,100,000" + + " -V " + validationDataLocation + "1000G.phase3.broad.withGenotypes.chr20.1MB.vcf", + 1, + Arrays.asList("e1adedc7e1d63e384187b24b7ded4410")); + executeTest("testUsingDiscoveredAF", spec); + } + +} \ No newline at end of file diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java new file mode 100644 index 000000000..03d136290 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -0,0 +1,168 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class CombineGVCFsIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + + privateTestDir + "gvcfExample1.vcf -V " + privateTestDir + "gvcfExample2.vcf" + args; + } + + @Test + public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception { + final String cmd = baseTestString(" -L 1:69485-69509"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneStartsBeforeTwoAndEndsAfterwards", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 2, "Observed: " + allVCs); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69491); + Assert.assertEquals(first.getEnd(), 69497); + 
Assert.assertEquals(first.getGenotypes().size(), 2); + Assert.assertTrue(first.getGenotype("NA1").isCalled()); + Assert.assertTrue(first.getGenotype("NA2").isNoCall()); + + final VariantContext second = allVCs.get(1); + Assert.assertEquals(second.getStart(), 69498); + Assert.assertEquals(second.getEnd(), 69506); + Assert.assertEquals(second.getGenotypes().size(), 2); + Assert.assertTrue(second.getGenotype("NA1").isCalled()); + Assert.assertTrue(second.getGenotype("NA2").isCalled()); + } + + @Test + public void testTwoSpansManyBlocksInOne() throws Exception { + final String cmd = baseTestString(" -L 1:69512-69634"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testTwoSpansManyBlocksInOne", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 5); + } + + @Test + public void testOneHasAltAndTwoHasNothing() throws Exception { + final String cmd = baseTestString(" -L 1:69511"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasAltAndTwoHasNothing", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 1); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69511); + Assert.assertEquals(first.getEnd(), 69511); + Assert.assertEquals(first.getGenotypes().size(), 2); + } + + @Test + public void testOneHasAltAndTwoHasRefBlock() throws Exception { + final String cmd = baseTestString(" -L 1:69635"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasAltAndTwoHasRefBlock", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 1); + + final VariantContext first = 
allVCs.get(0); + Assert.assertEquals(first.getStart(), 69635); + Assert.assertEquals(first.getEnd(), 69635); + Assert.assertEquals(first.getNAlleles(), 3); + Assert.assertEquals(first.getGenotypes().size(), 2); + } + + @Test + public void testOneHasDeletionAndTwoHasRefBlock() throws Exception { + final String cmd = baseTestString(" -L 1:69772-69783"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File gVCF = executeTest("testOneHasDeletionAndTwoHasRefBlock", spec).first.get(0); + final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); + + Assert.assertEquals(allVCs.size(), 3); + + final VariantContext first = allVCs.get(0); + Assert.assertEquals(first.getStart(), 69772); + Assert.assertEquals(first.getEnd(), 69776); + Assert.assertEquals(first.getNAlleles(), 3); + Assert.assertEquals(first.getGenotypes().size(), 2); + + final VariantContext second = allVCs.get(1); + Assert.assertEquals(second.getStart(), 69773); + Assert.assertEquals(second.getEnd(), 69774); + Assert.assertEquals(second.getGenotypes().size(), 2); + + final VariantContext third = allVCs.get(2); + Assert.assertEquals(third.getStart(), 69775); + Assert.assertEquals(third.getEnd(), 69783); + Assert.assertEquals(third.getGenotypes().size(), 2); + } + + @Test + public void testMD5s() throws Exception { + final String cmd = baseTestString(" -L 1:69485-69791"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("aecdfa9eb32b802cd629e9f811ef15fd")); + spec.disableShadowBCF(); + executeTest("testMD5s", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java new file mode 100644 index 000000000..1ca23caba --- /dev/null 
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -0,0 +1,116 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GenotypeGVCFsIntegrationTest extends WalkerTest { + + private static String baseTestString(String args, String ref) { + return "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + ref + args; + } + + @Test(enabled = true) + public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-20,000,000", b37KGReference), + 1, + Arrays.asList("2be5f6f7e7f79841108906555d548683")); + executeTest("combineSingleSamplePipelineGVCF", spec); + } + + @Test(enabled = false) // TODO -- reenable when this option works + public void combineSingleSamplePipelineGVCF_includeNonVariants() { + 
WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -inv -L 20:10,000,000-10,010,000", b37KGReference), + 1, + Arrays.asList("de957075796512cb9f333f77515e97d5")); + executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); + } + + @Test(enabled = true) + public void combineSingleSamplePipelineGVCF_addDbsnp() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), + 1, + Arrays.asList("e3c7452277898fece54bf60af9588666")); + executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); + } + + @Test(enabled = true) + public void testJustOneSample() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + + " -V " + privateTestDir + "gvcfExample1.vcf", + 1, + Arrays.asList("bee009201ec3ad7b4f42f913e7ef1367")); + executeTest("testJustOneSample", spec); + } + + @Test(enabled = true) + public void testSamplesWithDifferentLs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + + " -V " + privateTestDir + "gvcfExample1.vcf" + + " -V " + privateTestDir + "gvcfExample2.vcf", + 1, + Arrays.asList("67410d8ac490e3c9d19ba7a4cceaf8fb")); + executeTest("testSamplesWithDifferentLs", spec); + } +} \ No newline at end of file diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java new file mode 100644 index 000000000..e2f17a65f --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -0,0 +1,77 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests LeftAlignAndTrimVariants + */ +public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { + + @Test + public void testLeftAlignment() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", + 1, + Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); + executeTest("test left alignment", spec); + } + + @Test + public void testLeftAlignmentWithTrimmingAndMultialleliecs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", + 1, + Arrays.asList("d12468cf08cfd14354f781d5f42b279f")); + executeTest("test left alignment with trimming and hard multiple alleles", spec); + + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/RegenotypeVariantsIntegrationTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java new file mode 100644 index 000000000..703c044d4 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -0,0 +1,359 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is 
made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; + } + + @Test + public void testDiscordanceNoSampleSpecified() { + String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("954415f84996d27b07d00855e96d33a2") + ); + spec.disableShadowBCF(); + + executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); + } + + @Test + public void testRepeatedLineSelection() { + String testfile = privateTestDir + "test.dup.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -sn B -sn C --variant " + testfile), + 1, + Arrays.asList("125d1c9fa111cd38dfa2ff3900f16b57") + ); + + 
executeTest("testRepeatedLineSelection--" + testfile, spec); + } + + @Test + public void testDiscordance() { + String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + + b37hapmapGenotypes + " -disc " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43") + ); + spec.disableShadowBCF(); + + executeTest("testDiscordance--" + testFile, spec); + } + + @Test + public void testComplexSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelection--" + testfile, spec); + } + + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + + @Test + public void testNonExistingFieldSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -env -ef -select 'foo!=0||DP>0' --variant " + testfile), + 1, + 
Arrays.asList("44e77cea624cfff2b8acc3a4b30485cb") // should yield empty vcf because the foo!=0 will yield complete expression false + ); + spec.disableShadowBCF(); + executeTest("testNonExistingSelection--" + testfile, spec); + } + + @Test + public void testSampleExclusionFromFileAndSeparateSample() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("1f5c72951a35667c4bdf1be153787e27") + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusion--" + testfile, spec); + } + + @Test + public void testSampleExclusionJustFromFile() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677") + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusion--" + testfile, spec); + } + + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + + + @Test + public void testConcordance() { + 
String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + + b37hapmapGenotypes + " --variant " + testFile + + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", + 1, + Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") + ); + spec.disableShadowBCF(); + + executeTest("testConcordance--" + testFile, spec); + } + + @Test + public void testVariantTypeSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") + ); + + executeTest("testVariantTypeSelection--" + testFile, spec); + } + + @Test + public void testIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + 1, + Arrays.asList("004589868ca5dc887e2dff876b4cc797") + ); + + executeTest("testIndelLengthSelection--" + testFile, spec); + } + + @Test + public void testUsingDbsnpName() { + String testFile = privateTestDir + "combine.3.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") + ); + + executeTest("testUsingDbsnpName--" + testFile, spec); + } + + @Test + public void testRemoveMLE() { + String testFile = privateTestDir + "vcfexample.withMLE.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, 
+ Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") + ); + + executeTest("testRemoveMLE--" + testFile, spec); + } + + @Test + public void testKeepOriginalAC() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ad7e8b25e431a3229a78cec063876559") + ); + + executeTest("testKeepOriginalAC--" + testFile, spec); + } + + @Test + public void testKeepOriginalACAndENV() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("e9b8292212545684cdb163423329ee7e") + ); + + executeTest("testKeepOriginalACAndENV--" + testFile, spec); + } + + @Test + public void testMultipleRecordsAtOnePosition() { + String testFile = privateTestDir + "selectVariants.onePosition.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("44f7c47395ca5b2afef5313f592c8cea") + ); + + executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec); + } + + @Test + public void testNoGTs() { + String testFile = privateTestDir + "vcf4.1.example.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") + ); + + executeTest("testNoGTs--" + testFile, spec); + } + + @Test + public void testSelectFromMultiAllelic() { + String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; + String samplesFile = privateTestDir + "GIH.samples.list"; + 
WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") + ); + executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); + } + + @Test + public void testMultiAllelicAnnotationOrdering() { + String testfile = privateTestDir + "multi-allelic-ordering.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header " + + "-sn SAMPLE-CC -sn SAMPLE-CT -sn SAMPLE-CA --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("8fe7cdca8638461909262cb0769b2527") + ); + executeTest("test multi allelic annotation ordering --" + testfile, spec); + } + + @Test() + public void testFileWithoutInfoLineInHeader() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", IllegalStateException.class); + } + + @Test() + public void testFileWithoutInfoLineInHeaderWithOverride() { + testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null); + } + + private void testFileWithoutInfoLineInHeader(final String name, final Class expectedException) { + final String testFile = privateTestDir + "missingHeaderLine.vcf"; + final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + + testFile + " -o %s --no_cmdline_in_header" + + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); + WalkerTestSpec spec = + expectedException != null + ? 
new WalkerTestSpec(cmd, 1, expectedException) + : new WalkerTestSpec(cmd, 1, Arrays.asList("")); + spec.disableShadowBCF(); + + executeTest(name, spec); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java new file mode 100644 index 000000000..68eb1cc41 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -0,0 +1,110 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsParallelIntegrationTest extends WalkerTest { + + private class ParallelSelectTestProvider extends TestDataProvider { + final String reference; + final String args; + final String md5; + final int nt; + + private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { + super(ParallelSelectTestProvider.class); + this.reference = reference; + this.args = args; + this.md5 = md5; + this.nt = nt; + } + + public final String getCmdLine() { + return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; + } + + public String toString() { + return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); + } + } + + @DataProvider(name = "ParallelSelectTest") + public Object[][] makeParallelSelectTestProvider() { + for ( int nt : Arrays.asList(1, 2, 4) ) { + { // original MAF test + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; + new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); + } + { // new tests on b37 using testdir VCF + final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; + final String args = "-select 'DP > 30' -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); + } + { // AD and PL decoding race condition + final String testfile = privateTestDir + "race_condition.vcf"; + final String args = "-env -sn SAMPLE -L 
1:1-10,000,000 -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt); + } + } + + return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); + } + + @Test(dataProvider = "ParallelSelectTest") + public void testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { + final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); + executeTest(cfg.toString(), spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java similarity index 100% rename from 
protected/java/test/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/ContigComparatorUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java new file mode 100644 index 000000000..426462ed2 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/RandomDNA.java @@ -0,0 +1,125 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils; + +import java.util.Random; + +/** + * Random DNA sequence generator. + * + *

+ * Returned bases are always in upper case and one of the valid four nucleotides 'A', 'C', 'G' and 'T'. +

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class RandomDNA { + + private Random random; + + /** + * Constructs a new random DNA generator. + * + *

+ * The seed would be the default which would depend on system properties and the current time as + * described in {@link Random} documentation. + *

+ */ + @SuppressWarnings("unused") + public RandomDNA() { + random = new Random(); + } + + /** + * Constructs a new random DNA generator providing a seed. + * + * @param seed the random number generator seed. + */ + public RandomDNA(final long seed) { + random = new Random(seed); + } + + /** + * Updates the content of a byte array with a random base sequence. + * + *

+ * The whole array will be filled with new base values. + *

+ * + * @param destination the array to update. + * + * @throws NullPointerException if {@code destination} is {@code null}. + */ + public void nextBases(final byte[] destination) { + random.nextBytes(destination); + for (int i = 0; i < destination.length; i++) { + final int ord = destination[i] & 0x03; + switch (ord) { + case 0: destination[i] = 'A'; break; + case 1: destination[i] = 'C'; break; + case 2: destination[i] = 'G'; break; + case 3: destination[i] = 'T'; break; + default: throw new IllegalStateException("this cannot be happening!!!"); + } + } + } + + /** + * Returns a random RNA sequence of bases. + * @param size the length of the sequence. + * + * @throws IllegalArgumentException if {@code size} is negative. + * @return never {@code null}. + */ + public byte[] nextBases(final int size) { + if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); + final byte[] result = new byte[size]; + nextBases(result); + return result; + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotypeUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java new file mode 100644 index 000000000..cda022ab8 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -0,0 +1,392 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class GVCFWriterUnitTest extends BaseTest { + private static class MockWriter implements VariantContextWriter { + final List emitted = new ArrayList<>(); + boolean headerWritten = false; + boolean closed = false; + + @Override + public void writeHeader(VCFHeader header) { + headerWritten = true; + } + + @Override + public void close() { + closed = true; + } + + @Override + public void add(VariantContext vc) { + emitted.add(vc); + } + } + + private MockWriter mockWriter; + private List 
standardPartition = Arrays.asList(1, 10, 20); + private Allele REF = Allele.create("N", true); + private Allele ALT = Allele.create("A"); + private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + private final String SAMPLE_NAME = "XXYYZZ"; + + @BeforeMethod + public void setUp() throws Exception { + mockWriter = new MockWriter(); + } + + @Test + public void testHeaderWriting() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.writeHeader(new VCFHeader()); + Assert.assertTrue(mockWriter.headerWritten); + } + + @Test + public void testClose() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(); + Assert.assertTrue(mockWriter.closed); + } + + @Test + public void testCloseWithoutClosingUnderlyingWriter() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(false); + Assert.assertFalse(mockWriter.closed); + } + + private VariantContext makeHomRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, ALLELES); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeNonRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, 
Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeDeletion(final String contig, final int start, final int size) { + final String del = Utils.dupString("A", size); + final String alt = del.substring(0, 1); + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(del, alt)); + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(vc.getReference(), vc.getAlternateAllele(0))); + gb.GQ(50); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + @Test + public void testCloseEmitsLastVariant() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCloseDoesntEmitsLastVariantWhenNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCrossingContigBoundaryRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + + 
writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false); + } + + @Test + public void testCrossingContigBoundaryToLowerPositionsRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 30, 30)); + writer.add(makeHomRef("20", 31, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("21", 10, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 30, 31, false); + writer.add(makeNonRef("21", 11, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(1), "21", 10, 10, false); + assertGoodVC(mockWriter.emitted.get(2), "21", 11, 11, true); + } + + @Test + public void testCrossingContigBoundaryFromNonRefToLowerPositionsRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 20, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + writer.add(makeHomRef("21", 10, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 20, 20, true); + writer.add(makeNonRef("21", 11, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(1), "21", 10, 10, false); + assertGoodVC(mockWriter.emitted.get(2), "21", 11, 11, true); + } + + @Test + public void testCrossingContigBoundaryNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true); + } + + @Test + public void testCrossingContigBoundaryNonRefThenNonRef() { + 
final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + writer.add(makeNonRef("21", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true); + assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true); + } + + private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) { + Assert.assertEquals(vc.getChr(), contig); + Assert.assertEquals(vc.getStart(), start); + Assert.assertEquals(vc.getEnd(), stop); + if ( nonRef ) { + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + } else { + Assert.assertEquals(vc.getNAlleles(), 2); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); + Assert.assertEquals(vc.getGenotypes().size(), 1); + final Genotype g = vc.getGenotype(SAMPLE_NAME); + Assert.assertEquals(g.hasAD(), false); + Assert.assertEquals(g.hasLikelihoods(), true); + Assert.assertEquals(g.hasPL(), true); + Assert.assertEquals(g.getPL().length == 3, true); + Assert.assertEquals(g.hasDP(), true); + Assert.assertEquals(g.hasGQ(), true); + } + } + + @Test + public void testVariantForcesNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("20", 3, 30)); + writer.add(makeHomRef("20", 4, 30)); + writer.add(makeHomRef("20", 5, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 
3, 3, true); + writer.close(); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false); + } + + @Test + public void testEmittingTwoBands() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("20", 3, 50)); + writer.add(makeHomRef("20", 4, 50)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false); + } + + @Test + public void testNonContiguousBlocks() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRef("20", 10, 0)); + writer.add(makeHomRef("20", 11, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false); + } + + @Test + public void testDeletion() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeDeletion("20", 3, 3)); + writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true); + assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); + } + + @Test + public void testHomRefAlt() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRefAlt("20", 3, 0)); + 
writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); + } + + @DataProvider(name = "BandPartitionData") + public Object[][] makeBandPartitionData() { + List tests = new ArrayList<>(); + + tests.add(new Object[]{null, false}); + tests.add(new Object[]{Collections.emptyList(), false}); + tests.add(new Object[]{Arrays.asList(1), true}); + tests.add(new Object[]{Arrays.asList(1, 10), true}); + tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); + tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); + tests.add(new Object[]{Arrays.asList(-1, 1), false}); + tests.add(new Object[]{Arrays.asList(1, null, 10), false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BandPartitionData") + public void testMyData(final List partitions, final boolean expectedGood) { + try { + GVCFWriter.parsePartitions(partitions); + Assert.assertTrue(expectedGood, "Expected to fail but didn't"); + } catch ( Exception e ) { + Assert.assertTrue(! 
expectedGood, "Expected to succeed but failed with message " + e.getMessage()); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java diff --git 
a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java new file mode 100644 index 000000000..489eff0bc --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class NanoSchedulerIntegrationTest extends WalkerTest { + @DataProvider(name = "NanoSchedulerUGTest") + public Object[][] createNanoSchedulerUGTest() { + List tests = new ArrayList(); + + for ( final int nt : Arrays.asList(1, 2) ) + for ( final int nct : Arrays.asList(1, 2) ) { +// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); +//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); + tests.add(new Object[]{ "BOTH", "392dc99dc279082fc6e729b249adfa2b", nt, nct }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") + private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T UnifiedGenotyper -R " + b37KGReference, + "--no_cmdline_in_header -G", + //"--dbsnp " + b37dbSNP132, + "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", + "-L 20:10,000,000-10,100,000", + "-glm " + glm, + "--contamination_fraction_to_filter 0.0", + "-nt " + nt, + "-nct " + nct, + "-o %s" + ), + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); + } + + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMMUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java new file mode 100644 index 000000000..8c54326db --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModelUnitTest.java @@ -0,0 +1,337 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.QualityUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Iterator; + + +/** + * Unit tests for {@link PairHMMModel} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMModelUnitTest extends BaseTest { + + final double TOLERANCE = 1E-9; + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbs(final int insQual, final int delQual, final int gcp, final double[] expected) { + final double[] actual = PairHMMModel.qualToTransProbs((byte)insQual,(byte)delQual,(byte)gcp); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + assertEqualsDoubleArray(actual,expected,TOLERANCE); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + } + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsLog10(final int insQuals, final int delQual, final int gcp, final double[] expected) { + final double[] logExpected = new double[expected.length]; + for (int i = 0; i < logExpected.length; i++) + 
logExpected[i] = Math.log10(expected[i]); + final double[] actual = PairHMMModel.qualToTransProbsLog10((byte)insQuals,(byte)delQual,(byte)gcp); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length, PairHMMModel.TRANS_PROB_ARRAY_LENGTH); + assertEqualsDoubleArray(actual,logExpected,TOLERANCE); + } + + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsFill(final int insQual, final int delQual, final int gcp, final double[] expected) { + final double[] actual = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + PairHMMModel.qualToTransProbs(actual, (byte) insQual, (byte) delQual, (byte) gcp); + assertEqualsDoubleArray(actual,expected,TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbs(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.qualToTransProbs(insQuals,delQuals,gapQuals); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,expected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],expected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsLog10(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.qualToTransProbsLog10(insQuals,delQuals,gapQuals); + final double[][] logExpected = new double[expected.length][expected[0].length]; + for (int i = 1; i < expected.length; i++) + for (int j = 0; j < expected[0].length; j++) + logExpected[i][j] = Math.log10(expected[i][j]); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,logExpected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,logExpected[0].length); + for (int i = 0; i < actual.length ; i++) + 
assertEqualsDoubleArray(actual[i],logExpected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsLog10Fill(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.createTransitionMatrix(insQuals.length); + PairHMMModel.qualToTransProbsLog10(actual,insQuals,delQuals,gapQuals); + final double[][] logExpected = new double[expected.length][expected[0].length]; + for (int i = 1; i < expected.length; i++) + for (int j = 0; j < expected[0].length; j++) + logExpected[i][j] = Math.log10(expected[i][j]); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,logExpected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,logExpected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],logExpected[i],TOLERANCE); + } + + @Test(dataProvider="qualToTransDataProvider") + public void testQualsToTransProbsFill(final byte[] insQuals, final byte[] delQuals, final byte[] gapQuals, final double[][] expected) { + final double[][] actual = PairHMMModel.createTransitionMatrix(insQuals.length); + PairHMMModel.qualToTransProbs(actual,insQuals,delQuals,gapQuals); + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length); + Assert.assertNotNull(actual[0]); + Assert.assertEquals(actual[0].length,expected[0].length); + for (int i = 0; i < actual.length ; i++) + assertEqualsDoubleArray(actual[i],expected[i],TOLERANCE); + } + @Test(dataProvider="qualToProbsDataProvider") + public void testQualToProbsLog10Fill(final int insQuals, final int delQual, final int gcp, final double[] expected) { + final double[] logExpected = new double[expected.length]; + for (int i = 0; i < logExpected.length; i++) + logExpected[i] = Math.log10(expected[i]); + final double[] actual = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + PairHMMModel.qualToTransProbsLog10(actual, 
(byte) insQuals, (byte) delQual, (byte) gcp); + assertEqualsDoubleArray(actual,logExpected,TOLERANCE); + } + + + @DataProvider(name="qualToTransDataProvider") + public Iterator qualToTransDataProvider() { + return new Iterator() { + + private final Iterator readLengthIterator = readLengthIterator(); + private Iterator qualsIterator = qualIterator(); + + @Override + public boolean hasNext() { + return readLengthIterator.hasNext(); + } + + @Override + public Object[] next() { + final int readLength = readLengthIterator.next(); + double[][] matrix = new double[readLength+1][PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + final byte[] insQuals = new byte[readLength]; + final byte[] delQuals = new byte[readLength]; + final byte[] gapQuals = new byte[readLength]; + for (int i = 0; i < readLength; i++) { + if (!qualsIterator.hasNext()) + qualsIterator = qualIterator(); + final int[] quals = qualsIterator.next(); + final int insQual = quals[0]; + final int delQual = quals[1]; + final int gapQual = quals[2]; + final double[] trans = qualsToProbs(insQual, delQual, gapQual); + matrix[i+1] = trans; + insQuals[i] = (byte)insQual; + delQuals[i] = (byte)delQual; + gapQuals[i] = (byte)gapQual; + } + + return new Object[] { insQuals, delQuals, gapQuals, matrix }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + @DataProvider(name="qualToProbsDataProvider") + public Iterator qualToProbsDataProvider() { + return new Iterator() { + private final Iterator qualsIterator = qualIterator(); + + @Override + public boolean hasNext() { + return qualsIterator.hasNext(); + } + + @Override + public Object[] next() { + final int[] quals = qualsIterator.next(); + final int insQual = quals[0]; + final int delQual = quals[1]; + final int gapQual = quals[2]; + + final double[] trans = qualsToProbs(insQual, delQual, gapQual); + + + return new Object[] { insQual, delQual, gapQual, trans }; + } + + @Override + public void remove() { + throw new 
UnsupportedOperationException(); + } + }; + } + + private double[] qualsToProbs(final int insQual, final int delQual, final int gapQual) { + final double[] trans = new double[PairHMMModel.TRANS_PROB_ARRAY_LENGTH]; + final double matchToMatch = PairHMMModel.matchToMatchProb(insQual, delQual); + final double matchToInsert = QualityUtils.qualToErrorProb(insQual); + final double matchToDeletion = QualityUtils.qualToErrorProb(delQual); + final double indelToMatch = QualityUtils.qualToProb(gapQual); + final double indelToIndel = QualityUtils.qualToErrorProb(gapQual); + + trans[PairHMMModel.matchToMatch] = matchToMatch; + trans[PairHMMModel.matchToInsertion] = matchToInsert; + trans[PairHMMModel.matchToDeletion] = matchToDeletion; + trans[PairHMMModel.indelToMatch] = indelToMatch; + trans[PairHMMModel.deletionToDeletion] = trans[PairHMMModel.insertionToInsertion] = indelToIndel; + return trans; + } + + private Iterator readLengthIterator() { + return Arrays.asList(READ_LENGTHS).iterator(); + } + + private Iterator qualIterator() { + final int totalCount = INS_QUALS.length * DEL_QUALS.length * GAP_QUALS.length; + + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < totalCount; + } + + @Override + public int[] next() { + final int gap = i % GAP_QUALS.length; + final int indelGroup = i / GAP_QUALS.length; + final int del = indelGroup % DEL_QUALS.length; + final int ins = indelGroup / DEL_QUALS.length; + i++; + return new int[] { INS_QUALS[ins], DEL_QUALS[del], GAP_QUALS[gap]}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + + @Test(dataProvider = "dualTestDataProvider") + public void testDoubleQualToProb(final int insQual, final int delQual, final double log10Expected, final double expected) { + Assert.assertEquals(PairHMMModel.matchToMatchProb(insQual, delQual),expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProbLog10(insQual,
delQual),log10Expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProb((byte) insQual, (byte) delQual),expected,TOLERANCE); + Assert.assertEquals(PairHMMModel.matchToMatchProbLog10((byte) insQual, (byte) delQual),log10Expected,TOLERANCE); + } + + @DataProvider(name = "dualTestDataProvider") + private Iterator dualTestDataProvider() { + final int[] testQuals = new int[] { 0, 1, 2, 5, 10, 13, 17, 20, 23, 27, 30, 43, 57, 70, 100, 200, 254}; + + return new Iterator() { + private int i = 0; + private int j = 0; + + @Override + public Object[] next() { + + final int qual1 = testQuals[i]; + final int qual2 = testQuals[j]; + + final double errorProb1 = Math.pow(10,- 0.1 * qual1); + final double errorProb2 = Math.pow(10,- 0.1 * qual2); + final double expected = Math.max(0, (1 - (errorProb1 + errorProb2))); + final Object[] result = new Object[] { qual1, qual2,Math.log10(Math.min(1,expected)),Math.min(1, expected)}; + + if (++j >= testQuals.length) { + i++; + j = i; + } + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasNext() { + return i < testQuals.length; + } + }; + } + + + private static int[] INS_QUALS = {30, 45, 20, 10, 5, 60, 123 }; + + private static int[] DEL_QUALS = {30, 45, 20, 10, 5, 60, 123 }; + + private static int[] GAP_QUALS = {10, 20, 5}; + + private static Integer[] READ_LENGTHS = { 0, 1, 5, 20, 100, 250}; +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java new file mode 100644 index 000000000..69100bcdd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java @@ -0,0 +1,86 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD 
INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
package org.broadinstitute.sting.utils.pairhmm;

import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;

/**
 * Integration test for the Prob > 1 bug in PairHMM, exercised end-to-end
 * through the HaplotypeCaller and UnifiedGenotyper walkers.
 *
 * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
 */
public class PairHMMProbabilityBugIntegrationTest extends WalkerTest {

    private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile();
    private static final File BAM = new File("private/testdata", "pairhmm_prob_bug.bam").getAbsoluteFile();
    private static final File INTERVAL = new File("private/testdata", "pairhmm_prob_bug.interval.bed").getAbsoluteFile();

    private static final File UG_BAM = new File("private/testdata", "pairhmm_prob_bug.ug.bam").getAbsoluteFile();
    private static final File UG_INTERVAL = new File("private/testdata", "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile();

    /**
     * Wraps a walker command line into a spec that appends the output-file
     * placeholder and expects no md5 checks (empty md5 list).
     */
    private static WalkerTestSpec specWithOutput(final String commandLine) {
        return new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
    }

    @Test
    public void testHaplotypeCaller() {
        final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s",
                REFERENCE, BAM, INTERVAL);
        executeTest(getClass().getSimpleName() + ".testHaplotypeCaller", specWithOutput(commandLine));
    }

    @Test
    public void testUnifiedGenotyper() {
        final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL",
                REFERENCE, UG_BAM, UG_INTERVAL);
        executeTest(getClass().getSimpleName() + ".testUnifiedGenotyper", specWithOutput(commandLine));
    }
}
100644 index 000000000..7e2581c51 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java @@ -0,0 +1,117 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ContextCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + 
ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleContexts() { + GATKSAMRecord read = ReadUtils.createRandomRead(1000); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + + verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + } + + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; + } + return expectedContext; + } + + private static String stringFrom(byte[] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java new file mode 100644 index 000000000..4f8a70cc9 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -0,0 +1,136 @@ +/* +* By 
downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class CycleCovariateUnitTest { + CycleCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new CycleCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleCycles() { + short readLength = 10; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + 
read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); + + read.setReadNegativeStrandFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); + + read.setSecondOfPairFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); + + read.setReadNegativeStrandFlag(false); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); + } + + private void verifyCovariateArray(int[][] values, int init, int increment) { + for (short i = 0; i < values.length; i++) { + short actual = Short.decode(covariate.formatKey(values[i][0])); + int expected = init + (increment * i); + Assert.assertEquals(actual, expected); + } + } + + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } +} diff --git 
a/protected/java/test/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/QualQuantizerUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java new file mode 100644 index 000000000..eea8aa8f3 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java @@ -0,0 +1,143 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class ReadCovariatesUnitTest { + + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = false) + public void testCovariateGeneration() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final String RGID = "id"; + + ReadGroupCovariate rgCov = new ReadGroupCovariate(); + QualityScoreCovariate qsCov = new QualityScoreCovariate(); + ContextCovariate coCov = new ContextCovariate(); + CycleCovariate cyCov = new CycleCovariate(); + + rgCov.initialize(RAC); + qsCov.initialize(RAC); + coCov.initialize(RAC); + cyCov.initialize(RAC); + + Covariate[] requestedCovariates = new Covariate[4]; + requestedCovariates[0] = rgCov; + requestedCovariates[1] = qsCov; + requestedCovariates[2] = coCov; + requestedCovariates[3] = cyCov; + + final int NUM_READS = 100; + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + + final String[] readGroups = {"RG1", "RG2", "RGbla"}; + for (int idx = 0; idx < NUM_READS; idx++) { + for (final String rgs : readGroups) { + final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + 
read.setReadNegativeStrandFlag(rnd.nextBoolean()); + final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); + final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); + + // check quality score + Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + + } + + } + +} diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..a8366ce5c --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java @@ -0,0 +1,121 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ReadGroupCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setPlatformUnit(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testForceReadgroup() { + final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); + forcedRAC.FORCE_READGROUP = "FOO"; + final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); + forcedCovariate.initialize(forcedRAC); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); + runTest(rg, "FOO", forcedCovariate); + } + + private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { + GATKSAMRecord read 
= ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); + + } + + private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { + for (int[] value : values) { + String actual = covariate.formatKey(value[0]); + Assert.assertEquals(actual, expected); + } + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..d3c3ffe97 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -0,0 +1,171 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - 
SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); + final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); + } + + @Test + public void testOutput() { + final int length = 100; + + 
List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + + final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate cov : requiredCovariates) + requestedCovariates[covariateIndex++] = cov; + for (final Covariate cov : optionalCovariates) + requestedCovariates[covariateIndex++] = cov; + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = 
RecalUtils.computeCovariates(read, requestedCovariates); + + final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); + final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; + + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; + if ( covValue >= 0 ) { + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); + nKeys++; + } + } + } + } + Assert.assertEquals(nKeys, expectedKeys); + } + + private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { + final int numCovariates = 4; + final int numTables = 3; + final int mismatchContextPadding = mismatchesContextSize - 1; + final int indelContextPadding = 2 * (indelContextSize - 1); + final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); + + return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java new file mode 100644 index 000000000..74cb2a1eb --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -0,0 +1,245 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Random; + +public class RepeatCovariatesUnitTest { + + RepeatLengthCovariate rlCovariate; + RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; + RecalibrationArgumentCollection RAC; + + + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + 
rurlCovariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + + @Test + public void testFindNumberOfRepetitions() { + // First, test logic to compute number of repetitions of a substring on a given string. + int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + Assert.assertEquals(1,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + Assert.assertEquals(0,result); + // Same tests but looking backward on string + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), 
"GTACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + Assert.assertEquals(3,result); + + // test logic to get repeat unit and number of repeats from covariate value + final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; + for (String ru : repUnits) { + for (int k=1; k < 10; k++) { + Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); + Assert.assertEquals(pair.second.intValue(),k); + Assert.assertEquals(pair.first,ru); + } + } + + } + + /** + * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if + * they match with read context + */ + @Test + public void testManyObservations() { + final int NUM_UNITS = 10; + final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; + + Random random = new Random(); + + for (int r = 0; r < NUM_TEST_CASES; r++) { + final StringBuilder sb = new StringBuilder(); + // for each unit, generate a repeat unit at random with given random length + final ArrayList repeatUnits = new ArrayList(); + final ArrayList numsRepetitions = new ArrayList(); + for (int n=0; n < NUM_UNITS; n++) { + final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); + final String repeatUnit = getRandomBases(repLength); + final int numRepetitions = 
1+random.nextInt(MAX_NUM_REPETITIONS); + + // log for comparison with covariate + numsRepetitions.add(numRepetitions); + repeatUnits.add(repeatUnit); + + for (int k=0; k < numRepetitions; k++) + sb.append(repeatUnit); + + } + + final String readBases = sb.toString(); + System.out.println(readBases); + final int readLength = readBases.length(); + + final byte[] readQuals = new byte[readLength]; + Arrays.fill(readQuals,(byte)30); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); + + for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values 
are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + + + int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); + int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + + } + + + + + + + } + + /** + * Returns random bases of given length + * @param length required length + * @return given random string + */ + @Requires("length > 0") + String getRandomBases(final int length) { + byte[] bases = new byte[length]; + Random ran = new Random(); + for (int i=0; i < length; i++ ) { + int idx = ran.nextInt(4); + bases[i] = BaseUtils.baseIndexToSimpleBase(idx); + } + return new String(bases); + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignmentUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java similarity index 100% rename from protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java rename to 
protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java deleted file mode 100644 index 95be967a2..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ /dev/null @@ -1,457 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import cern.jet.math.Arithmetic; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - - -/** - * Phred-scaled p-value using Fisher's Exact Test to detect strand bias - * - *

Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation - * being seen on only the forward or only the reverse strand) in the reads. More bias is - * indicative of false positive calls. - *

- * - *

Caveat

- *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

- */ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - private final static boolean ENABLE_DEBUGGING = false; - private final static Logger logger = Logger.getLogger(FisherStrand.class); - - private static final String FS = "FS"; - private static final double MIN_PVALUE = 1E-320; - private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( !vc.isVariant() ) - return null; - - if ( vc.hasGenotypes() ) { - final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); - if ( tableFromPerSampleAnnotations != null ) { - return pValueForBestTable(tableFromPerSampleAnnotations, null); - } - } - - if (vc.isSNP() && stratifiedContexts != null) { - final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); - final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); - printTable("unfiltered", tableNoFiltering); - printTable("filtered", tableFiltering); - return pValueForBestTable(tableFiltering, tableNoFiltering); - } - else if (stratifiedPerReadAlleleLikelihoodMap != null) { - // either SNP with no alignment context, or indels: per-read likelihood map needed - final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); -// logger.info("VC " + vc); -// printTable(table, 0.0); - return pValueForBestTable(table, null); - } - else - // for non-snp variants, we need per-read likelihoods. 
- // for snps, we can get same result from simple pileup - return null; - } - - /** - * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together - * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation - * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation - */ - private int[][] getTableFromSamples( final GenotypesContext genotypes ) { - if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } - - final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference - boolean foundData = false; - - for( final Genotype g : genotypes ) { - if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) - continue; - - foundData = true; - final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); - final int[] data = encodeSBBS(sbbsString); - for( int index = 0; index < sbArray.length; index++ ) { - sbArray[index] += data[index]; - } - } - - return ( foundData ? decodeSBBS(sbArray) : null ); - } - - /** - * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 - * - * @param table1 a contingency table, may be null - * @param table2 a contingency table, may be null - * @return annotation result for FS given tables - */ - private Map pValueForBestTable(final int[][] table1, final int[][] table2) { - if ( table2 == null ) - return table1 == null ? 
null : annotationForOneTable(pValueForContingencyTable(table1)); - else if (table1 == null) - return annotationForOneTable(pValueForContingencyTable(table2)); - else { // take the one with the best (i.e., least significant pvalue) - double pvalue1 = pValueForContingencyTable(table1); - double pvalue2 = pValueForContingencyTable(table2); - return annotationForOneTable(Math.max(pvalue1, pvalue2)); - } - } - - /** - * Returns an annotation result given a pValue - * - * @param pValue - * @return a hash map from FS -> phred-scaled pValue - */ - private Map annotationForOneTable(final double pValue) { - final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs - return Collections.singletonMap(FS, value); - } - - public List getKeyNames() { - return Collections.singletonList(FS); - } - - public List getDescriptions() { - return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); - } - - /** - * Helper function to turn the FisherStrand table into the SB annotation array - * @param table the table used by the FisherStrand annotation - * @return the array used by the per-sample Strand Bias annotation - */ - public static int[] getContingencyArray( final int[][] table ) { - if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change - array[0] = table[0][0]; - array[1] = table[0][1]; - array[2] = table[1][0]; - array[3] = table[1][1]; - return array; - } - - /** - * Helper function to parse the genotype annotation into the SB annotation array - * @param string the string that is returned by genotype.getAnnotation("SB") - * @return 
the array used by the per-sample Strand Bias annotation - */ - private static int[] encodeSBBS( final String string ) { - final int[] array = new int[4]; - final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); - for( int index = 0; index < 4; index++ ) { - array[index] = Integer.parseInt(tokenizer.nextToken()); - } - return array; - } - - /** - * Helper function to turn the SB annotation array into the FisherStrand table - * @param array the array used by the per-sample Strand Bias annotation - * @return the table used by the FisherStrand annotation - */ - private static int[][] decodeSBBS( final int[] array ) { - if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } - final int[][] table = new int[2][2]; - table[0][0] = array[0]; - table[0][1] = array[1]; - table[1][0] = array[2]; - table[1][1] = array[3]; - return table; - } - - private Double pValueForContingencyTable(int[][] originalTable) { - int [][] table = copyContingencyTable(originalTable); - - double pCutoff = computePValue(table); - //printTable(table, pCutoff); - - double pValue = pCutoff; - while (rotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - table = copyContingencyTable(originalTable); - while (unrotateTable(table)) { - double pValuePiece = computePValue(table); - - //printTable(table, pValuePiece); - - if (pValuePiece <= pCutoff) { - pValue += pValuePiece; - } - } - - //System.out.printf("P-cutoff: %f\n", pCutoff); - //System.out.printf("P-value: %f\n\n", pValue); - - // min is necessary as numerical precision can result in pValue being slightly greater than 1.0 - return Math.min(pValue, 1.0); - } - - private static int [][] copyContingencyTable(int [][] t) { - int[][] c = new int[2][2]; - - for ( int i = 0; i < 2; i++ ) - for ( int j = 0; j < 2; j++ ) - c[i][j] = t[i][j]; - - return c; - } - - - 
private static void printTable(int[][] table, double pValue) { - logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); - } - - /** - * Printing information to logger.info for debugging purposes - * - * @param name the name of the table - * @param table the table itself - */ - private void printTable(final String name, final int[][] table) { - if ( ENABLE_DEBUGGING ) { - final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); - logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", - name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); - } - } - - private static boolean rotateTable(int[][] table) { - table[0][0] -= 1; - table[1][0] += 1; - - table[0][1] += 1; - table[1][1] -= 1; - - return (table[0][0] >= 0 && table[1][1] >= 0); - } - - private static boolean unrotateTable(int[][] table) { - table[0][0] += 1; - table[1][0] -= 1; - - table[0][1] -= 1; - table[1][1] += 1; - - return (table[0][1] >= 0 && table[1][0] >= 0); - } - - private static double computePValue(int[][] table) { - - int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; - int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; - int N = rowSums[0] + rowSums[1]; - - // calculate in log space so we don't die with high numbers - double pCutoff = Arithmetic.logFactorial(rowSums[0]) - + Arithmetic.logFactorial(rowSums[1]) - + Arithmetic.logFactorial(colSums[0]) - + Arithmetic.logFactorial(colSums[1]) - - Arithmetic.logFactorial(table[0][0]) - - Arithmetic.logFactorial(table[0][1]) - - Arithmetic.logFactorial(table[1][0]) - - Arithmetic.logFactorial(table[1][1]) - - Arithmetic.logFactorial(N); - return Math.exp(pCutoff); - } - - private static int sumRow(int[][] table, int column) { - int sum = 0; - for (int r = 0; r < table.length; r++) { - sum += table[r][column]; - } - - return sum; - } - - private static int sumColumn(int[][] table, int row) { - int sum = 0; 
- for (int c = 0; c < table[row].length; c++) { - sum += table[row][c]; - } - - return sum; - } - - /** - Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { - if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } - if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } - - final Allele ref = vc.getReference(); - final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - final int[][] table = new int[2][2]; - - for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - final GATKSAMRecord read = el.getKey(); - final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); - } - } - - return table; - } - - /** - Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - private static int[][] getSNPContingencyTable(final Map stratifiedContexts, - final Allele ref, - final Allele alt, - final int minQScoreToConsider ) { - int[][] table = new int[2][2]; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - for (PileupElement p : sample.getValue().getBasePileup()) { - - if ( ! 
isUsableBase(p) ) // ignore deletions and bad MQ - continue; - - if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) - continue; - - updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); - } - } - - return table; - } - - /** - * Can the base in this pileup element be used in comparative tests? - * - * @param p the pileup element to consider - * - * @return true if this base is part of a meaningful read for comparison, false otherwise - */ - private static boolean isUsableBase(final PileupElement p) { - return !( p.isDeletion() || - p.getMappingQual() == 0 || - p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); - } - - private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { - - final boolean matchesRef = allele.equals(ref, true); - final boolean matchesAlt = allele.equals(alt, true); - - if ( matchesRef || matchesAlt ) { - final int row = matchesRef ? 0 : 1; - - if ( read.isStrandless() ) { - - // ignore strandless reduced reads because they are always on the forward strand! - if ( !read.isReducedRead() ) { - - // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 - // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even - // if the read is only seen once, because it's a merged read or other) - final int toAdd = Math.max(representativeCount / 2, 1); - table[row][0] += toAdd; - table[row][1] += toAdd; - } - } else { - // a normal read with an actual strand - final boolean isFW = !read.getReadNegativeStrandFlag(); - final int column = isFW ? 
0 : 1; - table[row][column] += representativeCount; - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java deleted file mode 100644 index 906cfa021..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverage; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; - -/** - * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. - * - * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing - * reads associated with the samples with polymorphic genotypes. 
- */ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { -// private final static Logger logger = Logger.getLogger(QualByDepth.class); - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - if ( !vc.hasLog10PError() ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int depth = 0; - - for ( final Genotype genotype : genotypes ) { - - // we care only about variant calls with likelihoods - if ( !genotype.isHet() && !genotype.isHomVar() ) - continue; - - if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - depth += context.getBasePileup().depthOfCoverage(); - - } else if (perReadAlleleLikelihoodMap != null) { - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) - continue; - - depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); - } else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed - depth += genotype.getDP(); - } - } - - if ( depth == 0 ) - return null; - - final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); - double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); - QD = fixTooHighQD(QD); - Map map = new HashMap<>(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; - } - - /** - * The haplotype caller 
generates very high quality scores when multiple events are on the - * same haplotype. This causes some very good variants to have unusually high QD values, - * and VQSR will filter these out. This code looks at the QD value, and if it is above - * threshold we map it down to the mean high QD value, with some jittering - * - * // TODO -- remove me when HaplotypeCaller bubble caller is live - * - * @param QD the raw QD score - * @return a QD value - */ - private double fixTooHighQD(final double QD) { - if ( QD < MAX_QD_BEFORE_FIXING ) { - return QD; - } else { - return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; - } - } - - private final static double MAX_QD_BEFORE_FIXING = 35; - private final static double IDEAL_HIGH_QD = 30; - private final static double JITTER_SIGMA = 3; - - public List getKeyNames() { return Arrays.asList("QD"); } - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java deleted file mode 100644 index 4b1e48a36..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java +++ /dev/null @@ -1,100 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; - -import java.util.*; - -/** - * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias - * User: rpoplin - * Date: 8/28/13 - */ - -public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation { - - public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; - - @Override - public void annotate(final 
RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final AlignmentContext stratifiedContext, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( ! isAppropriateInput(alleleLikelihoodMap, g) ) - return; - - final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); - - gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); - } - - @Override - public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } - - @Override - public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } - - private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) { - return ! (map == null || g == null || !g.isCalled()); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java deleted file mode 100644 index 271617059..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.recalibration.*; - -import java.io.*; - -/** - * Evaluate the performance of the base recalibration process - * - *

This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.

- * - *

Caveat

- *

This tool is currently experimental. We do not provide documentation nor support for its operation.

- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) -@PartitionBy(PartitionType.READ) -public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { - - @Output - public PrintStream out; - - @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") - public File RECAL_FILE = null; - - public void initialize() { - out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors"); - - final GATKReport report = new GATKReport(RECAL_FILE); - final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); - for ( int row = 0; row < table.getNumRows(); row++ ) { - - final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); - final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME))); - final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); - - final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); - - final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported); - - //if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 ) - // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp)); - - if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") && - table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") && - table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") && - (QReported == 6 || QReported 
== 10 || QReported == 20 || QReported == 30 || QReported == 45) ) { - out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors)); - out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors)); - } - } - - } - - @Override - public boolean isDone() { - return true; - } - - private double asDouble(final Object o) { - if ( o instanceof Double ) - return (Double)o; - else if ( o instanceof Integer ) - return (Integer)o; - else if ( o instanceof Long ) - return (Long)o; - else - throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } - - @Override - public Integer reduceInit() { return 0; } - - @Override - public Integer reduce(Integer counter, Integer sum) { return 0; } - - @Override - public void onTraversalDone(Integer sum) {} -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java deleted file mode 100644 index 5c6e9dc01..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ /dev/null @@ -1,847 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.UserException; -import 
org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.PrintStream; -import java.lang.reflect.Constructor; -import java.util.*; - -public class UnifiedGenotyperEngine { - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - private static final String GPSTRING = "GENERALPLOIDY"; - - public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; - public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; - - public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; - public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; - - private static final int SNP_MODEL = 0; - private static final int INDEL_MODEL = 1; - - public enum OUTPUT_MODE { - /** produces calls only at variant sites */ - EMIT_VARIANTS_ONLY, - /** produces calls at variant sites and confident reference sites */ - EMIT_ALL_CONFIDENT_SITES, - /** produces calls at any callable site regardless of confidence; this argument is intended only for point - * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by - * no means produce a comprehensive set of indels in DISCOVERY mode */ - EMIT_ALL_SITES - } - - // the unified argument collection - private final UnifiedArgumentCollection UAC; - public UnifiedArgumentCollection getUAC() { return UAC; } - - // the annotation engine - private final VariantAnnotatorEngine annotationEngine; - - // the model used for calculating genotypes - private ThreadLocal> glcm = new ThreadLocal>(); - private final List modelsToUse = new ArrayList(2); - - // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] 
log10AlleleFrequencyPriorsIndels; - - // samples in input - private final Set samples; - - // the various loggers and writers - private final Logger logger; - private final PrintStream verboseWriter; - - // number of chromosomes (ploidy * samples) in input - private final int ploidy; - private final int N; - - // the standard filter to use for calls below the confidence threshold but above the emit threshold - private static final Set filter = new HashSet(1); - - private final GenomeLocParser genomeLocParser; - private final boolean BAQEnabledOnCMDLine; - - // --------------------------------------------------------------------------------------------------------- - // - // Public interface functions - // - // --------------------------------------------------------------------------------------------------------- - @Requires({"toolkit != null", "UAC != null"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - protected UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, Set samples, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - } - - @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { - this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; - genomeLocParser = toolkit.getGenomeLocParser(); - this.samples = new TreeSet(samples); - // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ - 
this.UAC = UAC; - - this.logger = logger; - this.verboseWriter = verboseWriter; - this.annotationEngine = engine; - - this.ploidy = ploidy; - this.N = samples.size() * ploidy; - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); - - filter.add(LOW_QUAL_FILTER_NAME); - - determineGLModelsToUse(); - - // do argument checking - if (UAC.annotateAllSitesWithPLs) { - if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) - throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); - - } - } - - /** - * @see #calculateLikelihoodsAndGenotypes(org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker, org.broadinstitute.sting.gatk.contexts.ReferenceContext, org.broadinstitute.sting.gatk.contexts.AlignmentContext, java.util.Set) - * - * same as the full call but with allSamples == null - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - return calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, null); - } - - - /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. - * - * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype - * for every sample in allSamples. If it's null there's no such guarentee. Providing this - * argument is critical when the resulting calls will be written to a VCF file. 
- * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) - * @return the VariantCallContext object - */ - public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Set allSamples) { - final List results = new ArrayList(2); - - final List models = getGLModelsToUse(tracker, refContext, rawContext); - - final Map perReadAlleleLikelihoodMap = new HashMap(); - - if ( models.isEmpty() ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - perReadAlleleLikelihoodMap.clear(); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - if ( stratifiedContexts == null ) { - results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); - } - else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); -// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set -// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) -// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); - - } - } - } - - return results; - } - - /** - * Compute GLs at a given locus. Entry point for engine calls from UGCalcLikelihoods. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) - * @return the VariantContext object - */ - public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map perReadAlleleLikelihoodMap) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - // return the first valid one we encounter - if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); - - } - - return null; - } - - /** - * Compute genotypes at a given locus. 
Entry point for engine calls from UGCallVariants. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final VariantContext vc) { - final List models = getGLModelsToUse(tracker, refContext, rawContext); - if ( models.isEmpty() ) { - return null; - } - - // return the first one - final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); - final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); - } - - /** - * Compute genotypes at a given locus. - * - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(VariantContext vc) { - return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Private implementation helpers - // - // --------------------------------------------------------------------------------------------------------- - - // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final Map stratifiedContexts, - final AlignmentContextUtils.ReadOrientation type, - final List alternateAllelesToUse, - final boolean useBAQedPileup, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - - // 
initialize the data for this thread if that hasn't been done yet - if ( glcm.get() == null ) { - glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); - } - - return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); - } - - private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { - VariantContext vc; - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); - if ( vcInput == null ) - return null; - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make(); - } else { - // deal with bad/non-standard reference bases - if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) - return null; - - Set alleles = new HashSet(); - alleles.add(Allele.create(ref.getBase(), true)); - vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); - } - - if ( annotationEngine != null ) { - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); - } - - return new VariantCallContext(vc, false); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(null, null, null, null, vc, model, 
perReadAlleleLikelihoodMap); - } - - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model, null); - } - - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext, - final Map stratifiedContexts, - final VariantContext vc, - final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); - } - - /** - * Main entry function to calculate genotypes of a given VC with corresponding GL's - * @param tracker Tracker - * @param refContext Reference context - * @param rawContext Raw context - * @param stratifiedContexts Stratified alignment contexts - * @param vc Input VC - * @param model GL calculation model - * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc - * @return VC with assigned genotypes - */ - public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext rawContext, Map stratifiedContexts, - final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { - - boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; - - // TODO TODO TODO TODO - // REFACTOR THIS FUNCTION, TOO UNWIELDY!! 
- - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); - } - - // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call - if (!canVCbeGenotyped(vc)) { - if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) - return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); - else - return null; - - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) { - if ( limitedContext ) - return null; - return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? - estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : - generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); - } - - AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); - - // is the most likely frequency conformation AC=0 for all alternate alleles? 
- boolean bestGuessIsRef = true; - - // determine which alternate alleles have AF>0 - final List myAlleles = new ArrayList(vc.getAlleles().size()); - final List alleleCountsofMLE = new ArrayList(vc.getAlleles().size()); - myAlleles.add(vc.getReference()); - for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { - final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); - if ( alternateAllele.isReference() ) - continue; - - // Compute if the site is considered polymorphic with sufficient confidence relative to our - // phred-scaled emission QUAL - final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - - // if the most likely AC is not 0, then this is a good alternate allele to use - if ( isNonRef ) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - bestGuessIsRef = false; - } - // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele - else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.annotateAllSitesWithPLs) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - } - } - - final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); - - // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice - final double phredScaledConfidence = - Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs - ? 
-10 * AFresult.getLog10PosteriorOfAFEq0() - : -10 * AFresult.getLog10PosteriorOfAFGT0()); - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); - } - - // start constructing the resulting VC - final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) - builder.filters(filter); - - // create the genotypes - final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); - builder.genotypes(genotypes); - - // print out stats if we have a writer - if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - final HashMap attributes = new HashMap(); - - // inherit attributed from input vc if requested - if (inheritAttributesFromInputVC) - attributes.putAll(vc.getAttributes()); - // if the site was downsampled, record that fact - if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) - attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) - attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); - - // add the MLE AC and AF annotations - if ( alleleCountsofMLE.size() 
> 0 ) { - attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); - final int AN = builder.make().getCalledChrCount(); - final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); - // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) - for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN)); - attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); - } - - if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { - //final boolean DEBUG_SLOD = false; - - // the overall lod - //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - - List allAllelesToUse = builder.make().getAlleles(); - - // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); - - // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); - //normalizedLog10Posteriors = 
MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); - //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); - - double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; - double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; - //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - - // strand score is max bias between forward and reverse strands - double strandScore = Math.max(forwardLod, reverseLod); - // rescale by a factor of 10 - strandScore *= 10.0; - //logger.debug(String.format("SLOD=%f", strandScore)); - - if ( !Double.isNaN(strandScore) ) - attributes.put("SB", strandScore); - } - - // finish constructing the resulting VC - builder.attributes(attributes); - VariantContext vcCall = builder.make(); - - if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine - // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - final ReadBackedPileup pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); - } - - // if we are subsetting alleles (either because there were too many or because some were not polymorphic) - // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
- if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync - vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall); - - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); - } - - /** - * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. - * @param vc Input VC - * @return Status check - */ - @Requires("vc != null") - protected boolean canVCbeGenotyped(final VariantContext vc) { - // protect against too many alternate alleles that we can't even run AF on: - if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { - logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + - " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); - return false; - } - else return true; - - } - - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { - - if ( !BaseUtils.isRegularBase(refContext.getBase()) ) - return null; - - Map stratifiedContexts = null; - - if ( model.name().contains("INDEL") ) { - - final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); - // don't call when there is no coverage - if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - - } else if ( model.name().contains("SNP") ) { - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); - - if ( !(UAC.OutputMode == 
OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { - int numDeletions = 0; - for ( final PileupElement p : rawContext.getBasePileup() ) { - if ( p.isDeletion() ) - numDeletions += p.getRepresentativeCount(); - } - if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { - return null; - } - } - } - - return stratifiedContexts; - } - - private final double getRefBinomialProbLog10(final int depth) { - return MathUtils.log10BinomialProbability(depth, 0); - } - - private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { - if ( contexts == null ) - return null; - - double log10POfRef = Math.log10(initialPofRef); - - // for each sample that we haven't examined yet - for ( String sample : samples ) { - final AlignmentContext context = contexts.get(sample); - if ( ignoreCoveredSamples && context != null ) - continue; - final int depth = context == null ? 
0 : context.getBasePileup().depthOfCoverage(); - log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta); - } - - return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); - } - - /** - * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference - * - * Assumes the sample is diploid - * - * @param depth the depth of the sample - * @param theta the heterozygosity of this species (between 0 and 1) - * @return a valid log10 probability of the sample being hom-ref - */ - @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"}) - @Ensures("MathUtils.goodLog10Probability(result)") - protected double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) { - final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); - return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); - } - - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { - Allele refAllele = null, altAllele = null; - for ( Allele allele : vc.getAlleles() ) { - if ( allele.isReference() ) - refAllele = allele; - else - altAllele = allele; - } - - for (int i = 0; i <= N; i++) { - StringBuilder AFline = new StringBuilder("AFINFO\t"); - AFline.append(pos); - AFline.append("\t"); - AFline.append(refAllele); - AFline.append("\t"); - if ( altAllele != null ) - AFline.append(altAllele); - else - AFline.append("N/A"); - AFline.append("\t"); - AFline.append(i + "/" + N + "\t"); - AFline.append(String.format("%.2f\t", ((float)i)/N)); - AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - verboseWriter.println(AFline.toString()); - } - - verboseWriter.println("P(f>0) = " + PofF); - verboseWriter.println("Qscore = " + phredScaledConfidence); - verboseWriter.println(); - 
} - - protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - } - - protected boolean passesCallThreshold(double conf) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; - } - - protected boolean confidentlyCalled(double conf, double PofF) { - return conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING || - (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); - } - - private void determineGLModelsToUse() { - String modelPrefix = ""; - if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY ) - modelPrefix = GPSTRING; - - // GGA mode => must initialize both the SNP and indel models - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - else { - modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); - } - } - - // decide whether we are currently processing SNPs, indels, neither, or both - private List getGLModelsToUse(final RefMetaDataTracker tracker, - final ReferenceContext refContext, - final AlignmentContext rawContext) { - if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) - return modelsToUse; - - if ( modelsToUse.size() != 2 ) - throw new IllegalStateException("GGA mode assumes that we have initialized both the SNP and indel models but found " + modelsToUse); - - // if we're genotyping given 
alleles then we need to choose the model corresponding to the variant type requested - final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - - if ( vcInput == null ) { - return Collections.emptyList(); // no work to be done - } else if ( vcInput.isSNP() ) { - return Collections.singletonList(modelsToUse.get(SNP_MODEL)); - } else if ( vcInput.isIndel() || vcInput.isMixed() ) { - return Collections.singletonList(modelsToUse.get(INDEL_MODEL)); - } else { - return Collections.emptyList(); // No support for other types yet - } - } - - /** - * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, - * where Pr(AC=i) = theta/i where theta is heterozygosity - * @param N Number of chromosomes - * @param priors (output) array to be filled with priors - * @param heterozygosity default heterozygosity to use, if inputPriors is empty - * @param inputPriors Input priors to use (in which case heterozygosity is ignored) - */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { - - - double sum = 0.0; - - if (!inputPriors.isEmpty()) { - // user-specified priors - if (inputPriors.size() != N) - throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); - - int idx = 1; - for (final double prior: inputPriors) { - if (prior < 0.0) - throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); - priors[idx++] = Math.log10(prior); - sum += prior; - } - } - else { - // for each i - for (int i = 1; i <= N; i++) { - final double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; - } - } - - // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics 
assumptions) - if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); - } - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); - } - - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - if (model.name().toUpperCase().contains("SNP")) - return log10AlleleFrequencyPriorsSNPs; - else if (model.name().toUpperCase().contains("INDEL")) - return log10AlleleFrequencyPriorsIndels; - else - throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - - } - - protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { - if( model.name().contains("SNP") ) - return HUMAN_SNP_HETEROZYGOSITY; - if( model.name().contains("INDEL") ) - return HUMAN_INDEL_HETEROZYGOSITY; - else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - - private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { - - final Map glcm = new HashMap(); - final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); - - for (int i = 0; i < glmClasses.size(); i++) { - final Class glmClass = glmClasses.get(i); - final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); - try { - final Object args[] = new Object[]{UAC,logger}; - final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); - glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); - } - catch (Exception e) { - throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + 
e.getMessage()); - } - } - - return glcm; - } - - public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { - if ( tracker == null || ref == null || logger == null ) - return null; - VariantContext vc = null; - - // search for usable record - for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); - } - } - } - - return vc; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java deleted file mode 100644 index 2ece18002..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ /dev/null @@ -1,360 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public abstract class DiploidExactAFCalc extends ExactAFCalc { - public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); - } - - @Override - protected AFCalcResult computeLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final int numAlternateAlleles = vc.getNAlleles() - 1; - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); - - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); - - // add AC=0 to the queue - final int[] zeroCounts = new int[numAlternateAlleles]; - ExactACset zeroSet = new ExactACset(numSamples+1, new 
ExactACcounts(zeroCounts)); - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.getACcounts(), zeroSet); - - while ( !ACqueue.isEmpty() ) { - getStateTracker().incNEvaluations(); // keep track of the number of evaluations - - // compute log10Likelihoods - final ExactACset set = ACqueue.remove(); - - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } - - return getResultFromFinalState(vc, log10AlleleFrequencyPriors); - } - - @Override - protected VariantContext reduceScope(final VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { - logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); - builder.alleles(alleles); - builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL)); - return builder.make(); - } else { - return vc; - } - } - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i 
< numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes(), true); - for ( final double[] likelihoods : GLs ) { - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); - if ( alleles.alleleIndex1 != 0 ) - likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - // don't double-count it - if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) - likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - private static final class DependentSet { - public final int[] ACcounts; - public final int PLindex; - - public DependentSet(final int[] ACcounts, final int PLindex) { - this.ACcounts = ACcounts; - this.PLindex = PLindex; - } - } - - private double calculateAlleleCountConformation(final ExactACset set, - final ArrayList genotypeLikelihoods, - final int numChr, - final LinkedList ACqueue, - final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors) { - - //if ( DEBUG ) - // System.out.printf(" *** computing LofK for set=%s%n", 
set.ACcounts); - - // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); - - final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - - // can we abort early because the log10Likelihoods are so small? - if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { - //if ( DEBUG ) - // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - return log10LofK; - } - - // iterate over higher frequencies if possible - final int ACwiggle = numChr - set.getACsum(); - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - final int numAltAlleles = set.getACcounts().getCounts().length; - - // add conformations for the k+1 case - for ( int allele = 0; allele < numAltAlleles; allele++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele]++; - // to get to this conformation, a sample would need to be AB (remember that ref=0) - final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different - if ( ACwiggle > 1 ) { - final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); - final ArrayList sameAlleles = new ArrayList(numAltAlleles); - - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele_i]++; - ACcountsClone[allele_j]++; - - // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) - final int PLindex = 
GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); - if ( allele_i == allele_j ) - sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); - else - differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); - } - } - - // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering - for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - return log10LofK; - } - - // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also pushes its value to the given callingSetIndex. - private void updateACset(final int[] newSetCounts, - final int numChr, - final ExactACset dependentSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset, - final ArrayList genotypeLikelihoods) { - final ExactACcounts index = new ExactACcounts(newSetCounts); - if ( !indexesToACset.containsKey(index) ) { - ExactACset set = new ExactACset(numChr/2 +1, index); - indexesToACset.put(index, set); - ACqueue.add(set); - } - - // push data from the dependency to the new set - //if ( DEBUG ) - // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); - pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); - } - - private void computeLofK(final ExactACset set, - final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors) { - - set.getLog10Likelihoods()[0] = 0.0; // the zero case - final int totalK = set.getACsum(); - - // special case for k = 0 over all k - if ( totalK == 0 ) { - for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) - set.getLog10Likelihoods()[j] = 
set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; - - final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); - getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return; - } - - // if we got here, then k > 0 for at least one k. - // the non-AA possible conformations were already dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { - - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; - set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); - } - - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; - } - - double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - - // update the MLE if necessary - getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); - - // apply the priors over each alternate allele - for ( final int ACcount : set.getACcounts().getCounts() ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - - getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); - } - - private void pushData(final ExactACset targetSet, - final ExactACset dependentSet, - final int PLsetIndex, - final ArrayList genotypeLikelihoods) { - final int totalK = targetSet.getACsum(); - - for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - 
final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = - determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; - targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); - } - } - } - - private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // the closed form representation generalized for multiple alleles is as follows: - // AA: (2j - totalK) * (2j - totalK - 1) - // AB: 2k_b * (2j - totalK) - // AC: 2k_c * (2j - totalK) - // BB: k_b * (k_b - 1) - // BC: 2 * k_b * k_c - // CC: k_c * (k_c - 1) - - // find the 2 alleles that are represented by this PL index - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** - // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** - - // the AX het case - if ( alleles.alleleIndex1 == 0 ) - return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; - - final int k_i = ACcounts[alleles.alleleIndex1-1]; - - // the hom var case (e.g. BB, CC, DD) - final double coeff; - if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { - coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; - } - // the het non-ref case (e.g. BC, BD, CD) - else { - final int k_j = ACcounts[alleles.alleleIndex2-1]; - coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; - } - - return coeff; - } - - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - return allelesToUse.size() == 1 - ? 
GATKVariantContextUtils.subsetToRefOnly(vc, ploidy) - : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, - assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java deleted file mode 100644 index 3d28db159..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; - -import java.util.ArrayList; - -/** - * Uses the Exact calculation of Heng Li - */ -abstract class ExactAFCalc extends AFCalc { - protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - - protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - } - - /** - * Wrapper class that compares two likelihoods associated with two alleles - */ - protected static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; - } - } - - /** - * Unpack GenotypesContext into arraylist of doubel values - * @param GLs Input genotype context - * @return ArrayList of doubles corresponding to GL vectors - */ - protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size() + 1); - - if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java deleted file mode 100644 index f8c364e82..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ /dev/null @@ -1,636 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public class GeneralPloidyExactAFCalc extends ExactAFCalc { - static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them - - private final int ploidy; - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - private final static boolean VERBOSE = false; - - protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - this.ploidy = ploidy; - } - - @Override - protected VariantContext reduceScope(VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { - logger.warn("this tool is currently set to genotype at most " + 
getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - final List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); - - VariantContextBuilder builder = new VariantContextBuilder(vc); - builder.alleles(alleles); - builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); - return builder.make(); - } else { - return vc; - } - } - - @Override - public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); - return getResultFromFinalState(vc, log10AlleleFrequencyPriors); - } - - /** - * Simple wrapper class to hold values of combined pool likelihoods. - * For fast hashing and fast retrieval, there's a hash map that shadows main list. 
- * - */ - static class CombinedPoolLikelihoods { - private LinkedList alleleCountSetList; - private HashMap conformationMap; - private double maxLikelihood; - - - public CombinedPoolLikelihoods() { - // final int numElements = GenotypeLikelihoods.numLikelihoods(); - alleleCountSetList = new LinkedList(); - conformationMap = new HashMap(); - maxLikelihood = Double.NEGATIVE_INFINITY; - } - - public void add(ExactACset set) { - alleleCountSetList.add(set); - conformationMap.put(set.getACcounts(), set); - final double likelihood = set.getLog10Likelihoods()[0]; - - if (likelihood > maxLikelihood ) - maxLikelihood = likelihood; - - } - - public boolean hasConformation(int[] ac) { - return conformationMap.containsKey(new ExactACcounts(ac)); - - } - - public double getLikelihoodOfConformation(int[] ac) { - return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; - } - - public double getGLOfACZero() { - return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list - } - - public int getLength() { - return alleleCountSetList.size(); - } - } - - /** - * - * Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles - * @param vc Input variant context - * @param numAllelesToChoose Number of alleles to choose - * @param ploidy Ploidy per pool - * @return list of numAllelesToChoose most likely alleles - */ - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = 
getGLs(vc.getGenotypes(), false); - for ( final double[] likelihoods : GLs ) { - - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); - // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele - for (int k=1; k < acCount.length;k++) { - if (acCount[k] > 0) - likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); - - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - - /** - * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. - * @param GLs Inputs genotypes context with per-pool GLs - * @param numAlleles Number of alternate alleles - * @param ploidyPerPool Number of samples per pool - * @param log10AlleleFrequencyPriors Frequency priors - */ - protected void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors) { - - final ArrayList genotypeLikelihoods = getGLs(GLs, true); - - - int combinedPloidy = 0; - - // Combine each pool incrementally - likelihoods will be renormalized at each step - CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods(); - - // first element: zero ploidy, e.g. 
trivial degenerate distribution - final int[] zeroCounts = new int[numAlleles]; - final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); - set.getLog10Likelihoods()[0] = 0.0; - - combinedPoolLikelihoods.add(set); - - if ( genotypeLikelihoods.size() <= 1 ) { - // no meaningful GLs at all, just set the tracker to non poly values - getStateTracker().reset(); // just mimic-ing call below - getStateTracker().setLog10LikelihoodOfAFzero(0.0); - } else { - for (int p=1; p ACqueue = new LinkedList(); - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(); - final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); - - // add AC=0 to the queue - final int[] zeroCounts = new int[numAlleles]; - final int newPloidy = originalPloidy + newGLPloidy; - zeroCounts[0] = newPloidy; - - ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); - - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.getACcounts(), zeroSet); - - // keep processing while we have AC conformations that need to be calculated - while ( !ACqueue.isEmpty() ) { - getStateTracker().incNEvaluations(); - // compute log10Likelihoods - final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); - - // clean up memory - indexesToACset.remove(ACset.getACcounts()); - if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); - - } - return newPool; - } - - // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods - /** - * - * @param set ExactACset holding conformation to be computed - * @param newPool New pool likelihood holder - * @param originalPool Original likelihood holder - * @param newGL New pool GL vector to combine - * @param log10AlleleFrequencyPriors Prior object - * @param 
originalPloidy Total ploidy of original combined pool - * @param newGLPloidy Ploidy of GL vector - * @param ACqueue Queue of conformations to compute - * @param indexesToACset AC indices of objects in queue - * @return max log likelihood - */ - private double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final LinkedList ACqueue, - final HashMap indexesToACset) { - - // compute likeihood in "set" of new set based on original likelihoods - final int numAlleles = set.getACcounts().getCounts().length; - final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); - - - // add to new pool - if (!Double.isInfinite(log10LofK)) - newPool.add(set); - - // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { - return log10LofK; - } - - // iterate over higher frequencies if possible - // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. - // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space - final int ACwiggle = set.getACcounts().getCounts()[0]; - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - - // add conformations for other cases - for ( int allele = 1; allele < numAlleles; allele++ ) { - final int[] ACcountsClone = set.getACcounts().getCounts().clone(); - ACcountsClone[allele]++; - // is this a valid conformation? 
- int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; - ACcountsClone[0] = newPloidy - altSum; - if (ACcountsClone[0] < 0) - continue; - - - GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); - } - - - return log10LofK; - } - - -// /** -// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. -// * Math is generalization of biallelic combiner. -// * -// * For vector K representing an allele count conformation, -// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) -// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) -// * @param originalPool First log-likelihood pool GL vector -// * @param yy Second pool GL vector -// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) -// * @param ploidy2 Ploidy of second pool -// * @param numAlleles Number of alleles -// * @param log10AlleleFrequencyPriors Array of biallelic priors -// * @param resultTracker Af calculation result object -// */ -// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, -// final double[] log10AlleleFrequencyPriors, -// final AFCalcResultTracker resultTracker) { -///* -// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); -// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); -// -// if (dim1 != originalPool.getLength() || dim2 != yy.length) -// throw new ReviewedStingException("BUG: Inconsistent vector length"); -// -// if (ploidy2 == 0) -// return; -// -// final int newPloidy = ploidy1 + ploidy2; -// -// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) -// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) -// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); -// final double[] x = originalPool.getLikelihoodsAsVector(true); -// while(firstIterator.hasNext()) { -// 
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); -// firstIterator.next(); -// } -// -// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); -// final double[] y = yy.clone(); -// while(secondIterator.hasNext()) { -// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); -// secondIterator.next(); -// } -// -// // initialize output to -log10(choose(m1+m2,[k1 k2...]) -// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); -// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); -// -// -// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K -// while(outputIterator.hasNext()) { -// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); -// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); -// -// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); -// outputIterator.next(); -// } -//*/ -// } - - /** - * Compute likelihood of a particular AC conformation and update AFresult object - * @param set Set of AC counts to compute - * @param firstGLs Original pool likelihoods before combining - * @param secondGL New GL vector with additional pool - * @param log10AlleleFrequencyPriors Allele frequency priors - * @param numAlleles Number of alleles (including ref) - * @param ploidy1 Ploidy of original pool (combined) - * @param ploidy2 Ploidy of new pool - * @return log-likehood of requested conformation - */ - private double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int 
numAlleles, final int ploidy1, final int ploidy2) { - - final int newPloidy = ploidy1 + ploidy2; - - // sanity check - int totalAltK = set.getACsum(); - if (newPloidy != totalAltK) - throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); - - totalAltK -= set.getACcounts().getCounts()[0]; - // totalAltK has sum of alt alleles of conformation now - - - // special case for k = 0 over all k - if ( totalAltK == 0 ) { // all-ref case - final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; - set.getLog10Likelihoods()[0] = log10Lof0; - - getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); - getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return log10Lof0; - - } else { - - // initialize result with denominator - // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. - // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i - - int[] currentCount = set.getACcounts().getCounts(); - double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); - - // for current conformation, get all possible ways to break vector K into two components G1 and G2 - final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; - while (innerIterator.hasNext()) { - // check if breaking current conformation into g1 and g2 is feasible. 
- final int[] acCount2 = innerIterator.getCurrentVector(); - final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2); - final int idx2 = innerIterator.getLinearIndex(); - // see if conformation is valid and if original pool had this conformation - // for conformation to be valid, all elements of g2 have to be <= elements of current AC set - if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) { - final double gl2 = secondGL[idx2]; - if (!Double.isInfinite(gl2)) { - final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1); - final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1); - final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); - final double sum = firstGL + gl2 + num1 + num2; - - set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); - } - } - innerIterator.next(); - } - - set.getLog10Likelihoods()[0] += denom; - } - - double log10LofK = set.getLog10Likelihoods()[0]; - - // update the MLE if necessary - final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); - - // apply the priors over each alternate allele - for (final int ACcount : altCounts ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); - - return log10LofK; - } - - /** - * Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy? 
- * @param set AC conformation vector - * @param ploidy Ploidy of set - * @return Valid conformation - */ - private static boolean isValidConformation(final int[] set, final int ploidy) { - int sum=0; - for (final int ac: set) { - if (ac < 0) - return false; - sum += ac; - - } - - return (sum == ploidy); - } - - /** - * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, - * including updating the PL's, and assign genotypes accordingly - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes true: assign hard genotypes, false: leave as no-call - * @param ploidy number of chromosomes per sample (pool) - * @return GenotypesContext with new PLs - */ - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - List NO_CALL_ALLELES = new ArrayList(ploidy); - - for (int k=0; k < ploidy; k++) - NO_CALL_ALLELES.add(Allele.NO_CALL); - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; - - - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - - // 
Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization - // and subsetting - if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( numNewAltAlleles == 0 ) - gb.noPL(); - else - gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - gb.alleles(NO_CALL_ALLELES); - else - assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); - newGTs.add(gb.make()); - } - } - - return newGTs; - - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param newLikelihoods the PL array - * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) - * @param numChromosomes Number of chromosomes per pool - * - * @return genotype - */ - private void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { - final int numNewAltAlleles = allelesToUse.size() - 1; - - - - // find the genotype with maximum likelihoods - final int PLindex = numNewAltAlleles == 0 ? 
0 : MathUtils.maxElementIndex(newLikelihoods); - - final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); - final ArrayList alleleFreqs = new ArrayList(); - final ArrayList alleleCounts = new ArrayList(); - - - for (int k=1; k < mlAlleleCount.length; k++) { - alleleCounts.add(mlAlleleCount[k]); - final double freq = (double)mlAlleleCount[k] / (double)numChromosomes; - alleleFreqs.add(freq); - - } - - // per-pool logging of AC and AF - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); - - // remove PLs if necessary - if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) - gb.noPL(); - - ArrayList myAlleles = new ArrayList(); - - // add list of called ML genotypes to alleles list - // TODO - too unwieldy? - int idx = 0; - for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) { - for (int k=0; k < mlAlleleCount[mlind]; k++) - myAlleles.add(idx++,allelesToUse.get(mlind)); - } - gb.alleles(myAlleles); - - if ( numNewAltAlleles > 0 ) - gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java deleted file mode 100644 index af5c79230..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ /dev/null @@ -1,426 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between 
the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -/** - * Computes the conditional bi-allelic exact results - * - * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: - * - * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] - * - * it then computes the conditional probability on AF_c == 0: - * - * (2) P(D | AF_t > 0 && AF_c == 0) - * - * Thinking about this visually, we have the following likelihood matrix where each cell is - * the P(D | AF_c == i && AF_t == j): - * - * 0 AF_c > 0 - * ----------------- - * 0 | | - * |--|------------- - * a | | - * f | | - * _ | | - * t | | - * > | | - * 0 | | - * - * What we really want to know how - * - * (3) P(D | AF_c == 0 & AF_t == 0) - * - * compares with - * - * (4) P(D | AF_c > 0 || AF_t > 0) - * - * This is effectively asking for the value in the upper left vs. the sum of all cells. 
- * - * This class implements the conditional likelihoods summation for any number of alt - * alleles, where each alt allele has its EXACT probability of segregating calculated by - * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: - * - * Suppose we have for a A/B/C site the following GLs: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B - * - * XX = AA + AC + CC (since X = A or C) - * XB = AB + BC - * BB = BB - * - * After each allele has its probability calculated we compute the joint posterior - * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i - * prior for the ith least likely allele. - */ - public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { - /** - * The min. confidence of an allele to be included in the joint posterior. - */ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); - - private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; - private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - /** - * Sorts AFCalcResults by their posteriors of AF > 0, so the - */ - private final static class CompareAFCalcResultsByPNonRef implements Comparator { - @Override - public int compare(AFCalcResult o1, AFCalcResult o2) { - return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); - } - } - - private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); - - /** - * The AFCalc model we are using to do the bi-allelic computation - */ - final AFCalc biAlleleExactModel; - - protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { - super(nSamples, maxAltAlleles, ploidy); - biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); - } - - /** - * Trivial subclass that 
helps with debugging by keeping track of the supporting information for this joint call - */ - private static class MyAFCalcResult extends AFCalcResult { - /** - * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call - */ - final List supporting; - - private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); - this.supporting = supporting; - } - } - - @Override - public AFCalcResult computeLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); - - if ( independentResultTrackers.size() == 0 ) - throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc); - - if ( independentResultTrackers.size() == 1 ) { - // fast path for the very common bi-allelic use case - return independentResultTrackers.get(0); - } else { - // we are a multi-allelic, so we need to actually combine the results - final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); - return combineIndependentPNonRefs(vc, withMultiAllelicPriors); - } - } - - /** - * Compute the conditional exact AFCalcResult for each allele in vc independently, returning - * the result of each, in order of the alt alleles in VC - * - * @param vc the VariantContext we want to analyze, with at least 1 alt allele - * @param log10AlleleFrequencyPriors the priors - * @return a list of the AFCalcResults for each bi-allelic sub context of vc - */ - @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) - @Ensures("goodIndependentResult(vc, result)") - protected final List computeAlleleIndependentExact(final 
VariantContext vc, - final double[] log10AlleleFrequencyPriors) { - final List results = new LinkedList(); - - for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { - final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); - results.add(resultTracker); - } - - return results; - } - - /** - * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results - */ - private static boolean goodIndependentResult(final VariantContext vc, final List results) { - if ( results.size() != vc.getNAlleles() - 1) return false; - for ( int i = 0; i < results.size(); i++ ) { - if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) - return false; - if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) - return false; - } - - return true; - } - - /** - * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order - * - * @param vc the variant context to split. Must have n.alt.alleles > 1 - * @return a bi-allelic variant context for each alt allele in vc - */ - @Requires({"vc != null", "vc.getNAlleles() > 1"}) - @Ensures("result.size() == vc.getNAlleles() - 1") - protected final List makeAlleleConditionalContexts(final VariantContext vc) { - final int nAltAlleles = vc.getNAlleles() - 1; - - if ( nAltAlleles == 1 ) { - // fast path for bi-allelic case. 
- return Collections.singletonList(vc); - } else { - // go through the work of ripping up the VC into its biallelic components - final List vcs = new LinkedList(); - - for ( int altI = 0; altI < nAltAlleles; altI++ ) { - vcs.add(biallelicCombinedGLs(vc, altI + 1)); - } - - return vcs; - } - } - - /** - * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex - * - * @param rootVC the root (potentially multi-allelic) variant context - * @param altAlleleIndex index of the alt allele, from 0 == first alt allele - * @return a bi-allelic variant context based on rootVC - */ - @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) - @Ensures({"result.isBiallelic()"}) - protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { - if ( rootVC.isBiallelic() ) { - return rootVC; - } else { - final int nAlts = rootVC.getNAlleles() - 1; - final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); - for ( final Genotype g : rootVC.getGenotypes() ) - biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); - - final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); - final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); - vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); - vcb.genotypes(biallelicGenotypes); - return vcb.make(); - } - } - - /** - * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case - * - * This is handled in the following way: - * - * Suppose we have for a A/B/C site the following GLs: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B - * - * XX = AA + AC + CC (since X = A or C) - * XB = AB + BC - * BB = BB - * - * @param original the original multi-allelic genotype - * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 - * @param nAlts the 
total number of alt alleles - * @return a new biallelic genotype with appropriate PLs - */ - @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) - @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) - protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { - if ( original.isNonInformative() ) - return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); - - if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); - - final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); - final double[] biAllelicPr = new double[3]; - - for ( int index = 0; index < normalizedPr.length; index++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); - - if ( pair.alleleIndex1 == altIndex ) { - if ( pair.alleleIndex2 == altIndex ) - // hom-alt case - biAllelicPr[2] = normalizedPr[index]; - else - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - } else { - if ( pair.alleleIndex2 == altIndex ) - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - else - // hom-non-alt - biAllelicPr[0] += normalizedPr[index]; - } - } - - final double[] GLs = new double[3]; - for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); - - return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); - } - - protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { - final ArrayList sorted = new ArrayList(conditionalPNonRefResults); - - // sort the results, so the most likely allele is first - Collections.sort(sorted, compareAFCalcResultsByPNonRef); - - double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); - final double log10SingleAllelePriorOfAFGt0 = 
conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); - - for ( int i = 0; i < sorted.size(); i++ ) { - if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) - throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); - - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; - final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); - final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; - - // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); - } - - return sorted; - } - - - /** - * Take the independent estimates of pNonRef for each alt allele and combine them into a single result - * - * Given n independent calculations for each of n alternate alleles create a single - * combined AFCalcResult with: - * - * priors for AF == 0 equal to theta^N for the nth least likely allele - * posteriors that reflect the combined chance that any alleles are segregating and corresponding - * likelihoods - * combined MLEs in the order of the alt alleles in vc - * - * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently - */ - protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, - final List sortedResultsWithThetaNPriors) { - int nEvaluations = 0; - final int nAltAlleles = sortedResultsWithThetaNPriors.size(); - final int[] alleleCountsOfMLE = new int[nAltAlleles]; - final double[] log10PriorsOfAC = new double[2]; - final Map log10pRefByAllele = new HashMap(nAltAlleles); - - // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs - double log10PosteriorOfACEq0Sum = 0.0; - double log10PosteriorOfACGt0Sum = 0.0; - - boolean anyPoly = false; - for ( final AFCalcResult 
sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { - final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); - final int altI = vc.getAlleles().indexOf(altAllele) - 1; - - // MLE of altI allele is simply the MLE of this allele in altAlleles - alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); - - // the AF > 0 case requires us to store the normalized likelihood for later summation - if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { - anyPoly = true; - log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); - log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); - } - - log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); - - // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); - - // trivial -- update the number of evaluations - nEvaluations += sortedResultWithThetaNPriors.nEvaluations; - } - - // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation - if ( ! anyPoly ) { - log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); - } - - // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, - // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently - // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). 
This sum is log10PosteriorOfACEq0 - // - // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we - // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where - // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO - final double log10PosteriorOfACGt0; - if ( log10PosteriorOfACEq0Sum == 0.0 ) - log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; - else - log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); - - final double[] log10LikelihoodsOfAC = new double[] { - // L + prior = posterior => L = poster - prior - log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], - log10PosteriorOfACGt0 - log10PriorsOfAC[1] - }; - - return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), - // priors incorporate multiple alt alleles, must be normalized - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pRefByAllele, sortedResultsWithThetaNPriors); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java deleted file mode 100644 index f1db5bcd7..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.LinkedList; -import java.util.List; -import java.util.TreeSet; - -/** - * Trim down an active region based on a set of variants found across the haplotypes within the region - * - * User: depristo - * Date: 4/27/13 - * Time: 2:10 PM - */ -class ActiveRegionTrimmer { - private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class); - private final boolean logTrimming; - private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping; - private final GenomeLocParser parser; - - /** - * Create a new ActiveRegionTrimmer - * - * @param logTrimming should we log our trimming events? - * @param snpPadding how much bp context should we ensure around snps? - * @param nonSnpPadding how much bp context should we ensure around anything not a snp? 
- * @param maxDistanceInExtensionForGenotyping the max extent we are will to go into the extended region of the - * origin active region in order to properly genotype events in the - * non-extended active region? - * @param parser a genome loc parser so we can create genome locs - */ - ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) { - if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding); - if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding); - if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping); - if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); - - logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping); - this.logTrimming = logTrimming; - this.snpPadding = snpPadding; - this.nonSnpPadding = nonSnpPadding; - this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping; - this.parser = parser; - } - - /** - * Trim down the active region to a region large enough to properly genotype the events found within the active - * region span, excluding all variants that only occur within its extended span. - * - * This function merely creates the region, but it doesn't populate the reads back into the region. - * - * @param region our full active region - * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position - * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region? 
- * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully - */ - public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { - - if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region - return null; - - final List withinActiveRegion = new LinkedList<>(); - boolean foundNonSnp = false; - GenomeLoc trimLoc = null; - for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { - final GenomeLoc vcLoc = parser.createGenomeLoc(vc); - if ( region.getLocation().overlapsP(vcLoc) ) { - if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - foundNonSnp = true; - trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); - withinActiveRegion.add(vc); - } - } - final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding ); - - // we don't actually have anything in the region after removing variants that don't overlap the region's full location - if ( trimLoc == null ) return null; - -// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); - // Try to have one kmer before and after any event. 
- - final GenomeLoc regionLoc = region.getLocation(); - final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); - final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad); - final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); - - final ActiveRegion trimmedRegion = region.trim(finalSpan); - if ( logTrimming ) { - logger.info("events : " + withinActiveRegion); - logger.info("region : " + regionLoc); - logger.info("trimLoc : " + trimLoc); - logger.info("pad : " + pad); - logger.info("idealSpan : " + idealSpan); - logger.info("maxSpan : " + maxSpan); - logger.info("finalSpan : " + finalSpan); - logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); - } - return trimmedRegion; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java deleted file mode 100644 index 091c09e8d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java +++ /dev/null @@ -1,466 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.collections.CountSet; -import org.broadinstitute.sting.utils.haplotype.Haplotype; - -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.*; - -/** - * Collection of read assembly using several kmerSizes. - * - *

- * There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those - * assemblies. - *

- * - *

- * Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> - */ -public class AssemblyResultSet { - - private final Map assemblyResultByKmerSize; - private final Set haplotypes; - private final Map assemblyResultByHaplotype; - private ActiveRegion regionForGenotyping; - private byte[] fullReferenceWithPadding; - private GenomeLoc paddedReferenceLoc; - private boolean variationPresent; - private Haplotype refHaplotype; - private boolean wasTrimmed = false; - private final CountSet kmerSizes; - - /** - * Constructs a new empty assembly result set. - */ - public AssemblyResultSet() { - assemblyResultByKmerSize = new LinkedHashMap<>(4); - haplotypes = new LinkedHashSet<>(10); - assemblyResultByHaplotype = new LinkedHashMap<>(10); - kmerSizes = new CountSet(4); - } - - /** - * Trims an assembly result set down based on a new set of trimmed haplotypes. - * - * @param originalByTrimmedHaplotypes map from trimmed to original haplotypes. - * @param trimmedActiveRegion the trimmed down active region. - * - * @throws NullPointerException if any argument in {@code null} or - * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. - * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones. - * - * - * @return never {@code null}, a new trimmed assembly result set. 
- */ - public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion, - final Map originalByTrimmedHaplotypes) { - if (refHaplotype == null) throw new IllegalStateException(); - if (trimmedActiveRegion == null) throw new NullPointerException(); - final AssemblyResultSet result = new AssemblyResultSet(); - - for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) { - final Haplotype original = originalByTrimmedHaplotypes.get(trimmed); - if (original == null) - throw new NullPointerException("all trimmed haplotypes must have an original one"); - final AssemblyResult as = assemblyResultByHaplotype.get(original); - if (as == null) result.add(trimmed); else result.add(trimmed, as); - } - - result.setRegionForGenotyping(trimmedActiveRegion); - result.setFullReferenceWithPadding(this.fullReferenceWithPadding); - result.setPaddedReferenceLoc(this.paddedReferenceLoc); - if (result.refHaplotype == null) - throw new IllegalStateException("missing reference haplotype in the trimmed set"); - result.wasTrimmed = true; - return result; - } - - /** - * Query the reference haplotype in the result set. - * @return {@code null} if none wasn't yet added, otherwise a reference haplotype. - */ - public Haplotype getReferenceHaplotype() { - return refHaplotype; - } - - /** - * Checks whether there is any variation present in the assembly result set. - * - *

- * This is equivalent to whether there is more than one haplotype. - *

- * - * @return {@code true} if there is variation present, {@code false} otherwise. - */ - public boolean isVariationPresent() { - return variationPresent && haplotypes.size() > 1; - } - - /** - * Dumps debugging information into a print-writer. - * - * @param pw where to dump the information. - * - * @throws NullPointerException if {@code pw} is {@code null}. - */ - public void debugDump(final PrintWriter pw) { - if (getHaplotypeList().size() == 0) { - return; - } - pw.println("Active Region " + this.regionForGenotyping.getLocation()); - pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc()); - pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation()); - pw.println("Haplotype count " + haplotypes.size()); - final Map kmerSizeToCount = new HashMap<>(); - - for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) { - final AssemblyResult as = e.getValue(); - final int kmerSize = as.getGraph().getKmerSize(); - if (kmerSizeToCount.containsKey(kmerSize)) { - kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1); - } else { - kmerSizeToCount.put(kmerSize,1); - } - } - pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() ); - Integer[] kmerSizes = new Integer[kmerSizeToCount.size()]; - kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes); - Arrays.sort(kmerSizes); - pw.println("Kmer sizes values " + Arrays.toString(kmerSizes)); - for (int size : kmerSizes) { - pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size)); - } - } - - /** - * Adds a haplotype to the result set without indicating a generating assembly result. - * - *

- * It is possible to call this method with the same haplotype several times. In that the second and further - * calls won't have any effect (thus returning {@code false}). - *

- * - * @param h the haplotype to add to the assembly result set. - * - * @throws NullPointerException if {@code h} is {@code null} - * @throws IllegalArgumentException if {@code h} does not have a genome location. - * - * @return {@code true} if the assembly result set has been modified as a result of this call. - */ - public boolean add(final Haplotype h) { - if (h == null) throw new NullPointerException("input haplotype cannot be null"); - if (h.getGenomeLocation() == null) - throw new IllegalArgumentException("the haplotype provided must have a genomic location"); - if (haplotypes.contains(h)) - return false; - haplotypes.add(h); - updateReferenceHaplotype(h); - return true; - } - - /** - * Adds simultaneously a haplotype and the generating assembly-result. - * - *

- * Haplotypes and their assembly-result can be added multiple times although just the first call will have - * any effect (return value is {@code true}). - *

- * - * - * @param h haplotype to add. - * @param ar assembly-result that is assumed to have given rise to that haplotype. - * - * @throws NullPointerException if {@code h} or {@code ar} is {@code null}. - * @throws IllegalArgumentException if {@code h} has not defined genome location. - * - * @return {@code true} iff this called changes the assembly result set. - */ - public boolean add(final Haplotype h, final AssemblyResult ar) { - if (h == null) throw new NullPointerException("input haplotype cannot be null"); - if (ar == null) throw new NullPointerException("input assembly-result cannot be null"); - if (h.getGenomeLocation() == null) - throw new IllegalArgumentException("the haplotype provided must have a genomic location"); - - final boolean assemblyResultAdditionReturn = add(ar); - - if (haplotypes.contains(h)) { - final AssemblyResult previousAr = assemblyResultByHaplotype.get(h); - if (previousAr == null) { - assemblyResultByHaplotype.put(h, ar); - return true; - } else if (!previousAr.equals(ar)) - throw new IllegalStateException("there is already a different assembly result for the input haplotype"); - else - return assemblyResultAdditionReturn; - } else { - haplotypes.add(h); - assemblyResultByHaplotype.put(h,ar); - updateReferenceHaplotype(h); - if (h.isNonReference()) variationPresent = true; - return true; - } - } - - /** - * Add a assembly-result object. - * - * @param ar the assembly result to add. - * - * @throws NullPointerException if {@code ar} is {@code null}. - * @throws IllegalStateException if there is an assembly result with the same kmerSize. - * @return {@code true} iff this addition changed the assembly result set. 
- */ - public boolean add(final AssemblyResult ar) { - if (ar == null) - throw new NullPointerException(); - final int kmerSize = ar.getKmerSize(); - if (assemblyResultByKmerSize.containsKey(kmerSize)) { - if (!assemblyResultByKmerSize.get(kmerSize).equals(ar)) - throw new IllegalStateException("a different assembly result with the same kmerSize was already added"); - return false; - } else { - assemblyResultByKmerSize.put(kmerSize, ar); - kmerSizes.add(kmerSize); - return true; - } - } - - /** - * Returns the current region for genotyping. - * - * @return might be {@code null}. - */ - public ActiveRegion getRegionForGenotyping() { - return regionForGenotyping; - } - - /** - * Sets the region for genotyping. - * - * @param regionForGenotyping the new value. - */ - public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) { - this.regionForGenotyping = regionForGenotyping; - } - - /** - * Returns the current full reference with padding. - * - * @return might be {@code null}. - */ - public byte[] getFullReferenceWithPadding() { - return fullReferenceWithPadding; - } - - /** - * Sets the full reference with padding base sequence. - * - * @param fullReferenceWithPadding the new value. - */ - public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) { - this.fullReferenceWithPadding = fullReferenceWithPadding; - } - - /** - * Returns the padded reference location. - * - * @return might be {@code null} - */ - public GenomeLoc getPaddedReferenceLoc() { - return paddedReferenceLoc; - } - - /** - * Changes the padded reference location. - * @param paddedReferenceLoc the new value. - */ - public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) { - this.paddedReferenceLoc = paddedReferenceLoc; - } - - /** - * Returns the number of haplotypes in the assembly result set. - * @return {@code 0} or greater. - */ - public int getHaplotypeCount() { - return haplotypes.size(); - } - - /** - * Returns the haplotypes as a list. - * - *

- * The result is unmodifiable. - *

- * - * @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly. - */ - public List getHaplotypeList() { - return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()])); - } - - /** - * Returns the maximum kmerSize available. - * - * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. - * - * @return greater than 0. - */ - public int getMaximumKmerSize() { - if (kmerSizes.size() == 0) - throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); - return kmerSizes.max(); - } - - /** - * Indicates whether there are more than one kmerSize in the set. - * - * @return {@code true} iff there is more than one kmerSize assembly in the set. - */ - public boolean hasMultipleKmerSizes() { - return kmerSizes.size() > 1; - } - - /** - * Returns the minimum kmerSize available. - * - * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. - * - * @return greater than 0. - */ - public int getMinimumKmerSize() { - if (kmerSizes.size() == 0) - throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); - return kmerSizes.min(); - } - - /** - * Returns a read-threading graph in the assembly set that has a particular kmerSize. - * - * @param kmerSize the requested kmerSize. - * - * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize. - */ - public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) { - final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize); - if (assemblyResult == null) return null; - return assemblyResult.getThreadingGraph(); - } - - /** - * Checks whether this assembly result set was trimmed. - * - * @return {@code true} iff this assembly result set was trimmed. 
- */ - public boolean wasTrimmed() { - return wasTrimmed; - } - - /** - * Marks the assembly as not having variation even if it has more than one haplotype. - */ - public void resetVariationPresent() { - variationPresent = false; - } - - /** - * Dumps debugging information into a logger. - * - * @param logger where to dump the information. - * - * @throws NullPointerException if {@code logger} is {@code null}. - */ - public void debugDump(final Logger logger) { - final StringWriter sw = new StringWriter(); - final PrintWriter pw = new PrintWriter(sw); - debugDump(pw); - final String str = sw.toString(); - final String[] lines = str.split("\n"); - for (final String line : lines) { - if (line.isEmpty()) { - continue; - } - logger.debug(line); - } - } - - /** - * Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the - * reference haplotype and updates {@link #refHaplotype} accordingly. - * - *

- * This method assumes that the colling code has verified that the haplotype was not already in {@link #haplotypes} - * I.e. that it is really a new one. Otherwise it will result in an exception if it happen to be a reference - * haplotype and this has already be set. This is the case even if the new haplotypes and the current reference - * are equal. - *

- * - * @param newHaplotype the new haplotype. - * @throws NullPointerException if {@code newHaplotype} is {@code null}. - * @throws IllegalStateException if there is already a reference haplotype. - */ - private void updateReferenceHaplotype(final Haplotype newHaplotype) { - if (!newHaplotype.isReference()) return; - if (refHaplotype == null) - refHaplotype = newHaplotype; - else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality. - throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java deleted file mode 100644 index 139f2e07d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.DefaultHashMap; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.EventMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.util.*; - -public class GenotypingEngine { - private final static Logger logger = Logger.getLogger(GenotypingEngine.class); - - private final boolean DEBUG; - private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - private final static List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied - private final VariantAnnotatorEngine annotationEngine; - private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; - - public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, - final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, - final 
MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { - this.DEBUG = DEBUG; - this.annotationEngine = annotationEngine; - this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; - noCall.add(Allele.NO_CALL); - this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; - } - - /** - * Carries the result of a call to #assignGenotypeLikelihoods - */ - public static class CalledHaplotypes { - private final List calls; - private final Set calledHaplotypes; - - protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { - if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) - throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); - this.calls = calls; - this.calledHaplotypes = calledHaplotypes; - } - - /** - * Get the list of calls made at this location - * @return a non-null (but potentially empty) list of calls - */ - public List getCalls() { - return calls; - } - - /** - * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). 
- * @return a non-null set of haplotypes - */ - public Set getCalledHaplotypes() { - return calledHaplotypes; - } - } - - /** - * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute - * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling - * - * The list of samples we're working with is obtained from the haplotypeReadMap - * - * @param UG_engine UG Engine with basic input parameters - * @param haplotypes Haplotypes to assign likelihoods to - * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) - * @param perSampleFilteredReadList - * @param ref Reference bytes at active region - * @param refLoc Corresponding active region genome location - * @param activeRegionWindow Active window - * @param genomeLocParser GenomeLocParser - * @param activeAllelesToGenotype Alleles to genotype - * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes - */ - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - @Ensures("result != null") - // TODO - can this be refactored? this is hard to follow! 
- public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final RefMetaDataTracker tracker, - final List activeAllelesToGenotype ) { - // sanity check input arguments - if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); - if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); - if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); - if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - - // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference - // that carry events among the haplotypes - final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); - - // Walk along each position in the key set and create each event to be outputted - final Set calledHaplotypes = new 
HashSet<>(); - final List returnCalls = new ArrayList<>(); - final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); - - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); - - if( eventsAtThisLoc.isEmpty() ) { continue; } - - // Create the event mapping object which maps the original haplotype events to the events present at just this locus - final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); - - // Sanity check the priority list for mistakes - final List priorityList = makePriorityList(eventsAtThisLoc); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - if( mergedVC == null ) { continue; } - - if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { - // this is possible in GGA mode when the same event is represented in multiple input records - throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. 
You will need to remove one of the records in order to proceed with your input file(s)."); - } - final Map mergeMap = new LinkedHashMap<>(); - mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) { - mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function - } - - final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); - - if( DEBUG ) { - logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - } - - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); - - final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); - final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); - if( call != null ) { - final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); - final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - - VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); - - if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
- annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - } - - // maintain the set of all called haplotypes - for ( final Allele calledAllele : call.getAlleles() ) - calledHaplotypes.addAll(alleleMapper.get(calledAllele)); - - returnCalls.add( annotatedCall ); - } - } - } - return new CalledHaplotypes(returnCalls, calledHaplotypes); - } - - /** - * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts - * - * @param haplotypes the list of haplotypes we're working with - * @param haplotypeReadMap map from samples -> the per read allele likelihoods - * @param ref the reference bases (over the same interval as the haplotypes) - * @param refLoc the span of the reference bases - * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) - * @return - */ - private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, - final Map haplotypeReadMap, - final byte[] ref, - final GenomeLoc refLoc, - final List activeAllelesToGenotype) { - final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); - - if ( in_GGA_mode ) startPosKeySet.clear(); - - //cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently - if ( !in_GGA_mode ) { - // run the event merger if we're not in GGA mode - final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); - if ( mergedAnything ) - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events - } - - if ( in_GGA_mode ) { - for( final VariantContext compVC : activeAllelesToGenotype ) { - startPosKeySet.add( compVC.getStart() ); - 
} - } - - return startPosKeySet; - } - - /** - * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view - * @param vcs a list of variant contexts - * @return the list of the sources of vcs in the same order - */ - private List makePriorityList(final List vcs) { - final List priorityList = new LinkedList<>(); - for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); - return priorityList; - } - - private List getVCsAtThisLocation(final List haplotypes, - final int loc, - final List activeAllelesToGenotype) { - // the overlapping events to merge into a common reference view - final List eventsAtThisLoc = new ArrayList<>(); - - if( activeAllelesToGenotype.isEmpty() ) { - for( final Haplotype h : haplotypes ) { - final EventMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - } else { // we are in GGA mode! 
- int compCount = 0; - for( final VariantContext compVC : activeAllelesToGenotype ) { - if( compVC.getStart() == loc ) { - int alleleCount = 0; - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - List alleleSet = new ArrayList<>(2); - alleleSet.add(compVC.getReference()); - alleleSet.add(compAltAllele); - final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; - // check if this event is already in the list of events due to a repeat in the input alleles track - final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); - boolean alreadyExists = false; - for( final VariantContext eventToTest : eventsAtThisLoc ) { - if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { - alreadyExists = true; - } - } - if( !alreadyExists ) { - eventsAtThisLoc.add(candidateEventToAdd); - } - alleleCount++; - } - } - compCount++; - } - } - - return eventsAtThisLoc; - } - - /** - * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele - * @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods - * @param mergedVC Input VC with event to genotype - * @return GenotypesContext object wrapping genotype objects with PLs - */ - @Requires({"alleleReadMap!= null", "mergedVC != null"}) - @Ensures("result != null") - private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for( final String sample : alleleReadMap.keySet() ) { - final int numHaplotypes = mergedVC.getAlleles().size(); - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 
2]; - final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); - } - return genotypes; - } - - private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, - final Map perSampleReadMap, - final Map> perSampleFilteredReadList, - final VariantContext call ) { - - final Map returnMap = new LinkedHashMap<>(); - final GenomeLoc callLoc = parser.createGenomeLoc(call); - for( final Map.Entry sample : perSampleReadMap.entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - - for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end... 
- for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { - likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); - } - } - } - - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele allele : call.getAlleles() ) { - likelihoodMap.add(read, allele, 0.0); - } - } - } - - returnMap.put(sample.getKey(), likelihoodMap); - } - return returnMap; - } - - /** - * Removes symbolic events from list of haplotypes - * @param haplotypes Input/output list of haplotypes, before/after removal - */ - // TODO - split into input haplotypes and output haplotypes as not to share I/O arguments - @Requires("haplotypes != null") - protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { - final List haplotypesToRemove = new ArrayList<>(); - for( final Haplotype h : haplotypes ) { - for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { - if( vc.isSymbolic() ) { - for( final Haplotype h2 : haplotypes ) { - for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { - if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events - haplotypesToRemove.add(h); - break; - } - } - } - } - } - } - haplotypes.removeAll(haplotypesToRemove); - } - - // BUGBUG: ugh, too complicated - protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, - final Map> alleleMapper, - final Map perSampleDownsamplingFraction ) { - - final Map alleleReadMap = new LinkedHashMap<>(); - for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample - 
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele - final List mappedHaplotypes = alleleMapperEntry.getValue(); - for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read - double maxLikelihood = Double.NEGATIVE_INFINITY; - for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele - if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string - maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); - } - } - perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); - } - } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling - alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); - } - - return alleleReadMap; - } - - protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { - final Map> alleleMapper = new LinkedHashMap<>(); - for( final Map.Entry entry : mergeMap.entrySet() ) { - alleleMapper.put(entry.getValue(), eventMap.get(new Event(entry.getKey()))); - } - return alleleMapper; - } - - @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) - @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { - - final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); - final Event refEvent = new Event(null); - eventMapper.put(refEvent, new ArrayList()); - for( final VariantContext vc : eventsAtThisLoc ) { - eventMapper.put(new Event(vc), new ArrayList()); - } - - for( final Haplotype h : haplotypes ) { - if( 
h.getEventMap().get(loc) == null ) { - eventMapper.get(refEvent).add(h); - } else { - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - eventMapper.get(new Event(vcAtThisLoc)).add(h); - break; - } - } - } - } - - return eventMapper; - } - - @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) - protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { - if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } - final List eventAllelesForSample = new ArrayList<>(); - for( final Allele a : haplotypeAllelesForSample ) { - final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); - for( int iii = 0; iii < alleleMapper.size(); iii++ ) { - final List mappedHaplotypes = alleleMapper.get(iii); - if( mappedHaplotypes.contains(haplotype) ) { - eventAllelesForSample.add(eventAlleles.get(iii)); - break; - } - } - } - return eventAllelesForSample; - } - - @Deprecated - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { - return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); - } - - protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { - for( final VariantContext vc : list ) { - if( vc.hasSameAllelesAs(vcToTest) ) { - return true; - } - } - return false; - } - - protected static class Event { - public VariantContext vc; - - public Event( final VariantContext vc ) { - this.vc = vc; - } - - @Override - public boolean equals( final Object obj ) { - return obj instanceof Event && ((((Event) obj).vc == null && vc == null) || (((Event) obj).vc != null && vc != null && ((Event) obj).vc.hasSameAllelesAs(vc))) ; - } - - @Override - public int hashCode() { - return (vc == null ? 
-1 : vc.getAlleles().hashCode()); - } - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java deleted file mode 100644 index b785b8d21..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ /dev/null @@ -1,1206 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Ensures; -import net.sf.samtools.SAMFileWriter; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import 
org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.gvcf.GVCFWriter; -import org.broadinstitute.sting.utils.haplotype.*; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; - -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. - * - *

Input

- *

- * Input bam file(s) from which to make calls - *

- * - *

Output

- *

- * VCF file with raw, unrecalibrated SNP and indel calls. - *

- * - *

Examples

- *
- *   java
- *     -jar GenomeAnalysisTK.jar
- *     -T HaplotypeCaller
- *     -R reference/human_g1k_v37.fasta
- *     -I sample1.bam [-I sample2.bam ...] \
- *     --dbsnp dbSNP.vcf \
- *     -stand_call_conf [50.0] \
- *     -stand_emit_conf 10.0 \
- *     [-L targets.interval_list]
- *     -o output.raw.snps.indels.vcf
- * 
- * - *

Caveats

- *
    - *
  • The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.
  • - *
- * - * @author rpoplin - * @since 8/22/11 - */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.LOCUS) -@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=100, maxRegion=300) -@ReadFilters({HCMappingQualityFilter.class}) -@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) -public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable { - // ----------------------------------------------------------------------------------------------- - // general haplotype caller arguments - // ----------------------------------------------------------------------------------------------- - - /** - * A raw, unfiltered, highly sensitive callset in VCF format. - */ - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - @Hidden - @Advanced - @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", - doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) - protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; - - @Hidden - @Advanced - @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) - protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN; - - @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) - protected PrintStream graphWriter = null; - - /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. 
- * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. - * - * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to - * use in specific areas where you want to better understand why the HC is making specific calls. - * - * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches - * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended - * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more - * easily see which reads go with these haplotype. - * - * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire - * active region, coming from read HC and a special read group. - * - * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean - * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to - * its next best haplotype. - * - * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, - * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV - * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen - * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png - * - */ - @Advanced - @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) - protected StingSAMFileWriter bamWriter = null; - private HaplotypeBAMWriter haplotypeBAMWriter; - - /** - * The type of BAM output we want to see. 
- */ - @Advanced - @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) - public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; - - /** - * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. - * dbSNP is not used in any way for the calculations themselves. - */ - @ArgumentCollection - protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - private double log10GlobalReadMismappingRate; - - public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } - - /** - * If a call overlaps with a record from the provided comp track, the INFO field will be annotated - * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). - * Records that are filtered in the comp track will be ignored. - * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). - */ - @Advanced - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) - public List> comps = Collections.emptyList(); - public List> getCompRodBindings() { return comps; } - - // The following are not used by the Unified Genotyper - public RodBinding getSnpEffRodBinding() { return null; } - public List> getResourceRodBindings() { return Collections.emptyList(); } - public boolean alwaysAppendDbsnpId() { return false; } - - /** - * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. - */ - @Advanced - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); - - /** - * Which annotations to exclude from output in the VCF file. 
Note that this argument has higher priority than the -A or -G arguments, - * so annotations will be excluded even if they are explicitly included with the other options. - */ - @Advanced - @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); - - /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. - */ - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected String[] annotationClassesToUse = { "Standard" }; - - @ArgumentCollection - private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); - - // ----------------------------------------------------------------------------------------------- - // arguments to control internal behavior of the read threading assembler - // ----------------------------------------------------------------------------------------------- - - @Advanced - @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) - protected List kmerSizes = Arrays.asList(10, 25); - - @Advanced - @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Should we disable the iterating over kmer sizes when graph cycles are detected?", required = false) - protected boolean dontIncreaseKmerSizesForCycles = false; - - @Advanced - @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) - protected int numPruningSamples = 1; - - @Hidden - @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", 
doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) - protected boolean dontRecoverDanglingTails = false; - - // ----------------------------------------------------------------------------------------------- - // general advanced arguments to control haplotype caller behavior - // ----------------------------------------------------------------------------------------------- - - @Advanced - @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false) - protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; - - public enum ReferenceConfidenceMode { - NONE, - BP_RESOLUTION, - GVCF - } - - /** - * The GQ partition intervals - * - * Should be a non-empty list of boundaries. For example, suppose this variable is - * - * [A, B, C] - * - * We would partition our hom-ref sites into the following bands: - * - * X < A - * A <= X < B - * B <= X < C - * X >= C - * - * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: - * - * [0, 0] - * (0, 10] - * (10, 20] - * (20, 30] - * (30, 40] - * (40, 50] - * (50, 99] - * - * Note that in the GATK GQ values are capped at 99. - */ - @Advanced - @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) - protected List GVCFGQBands = Arrays.asList(5, 20, 60); - - /** - * This parameter determines the maximum size of an indel considered as potentially segregating in the - * reference model. It is used to eliminate reads from being indel informative at a site, and determines - * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to - * X means that each informative read is consistent with any indel of size < X being present at a specific - * position in the genome, given its alignment to the reference. 
- */ - @Advanced - @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false) - protected int indelSizeToEliminateInRefModel = 10; - - // ----------------------------------------------------------------------------------------------- - // general advanced arguments to control haplotype caller behavior - // ----------------------------------------------------------------------------------------------- - - /** - * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution. - * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the - * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values - * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher - * depth to produce calls). - */ - @Advanced - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; - - @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) - protected int gcpHMM = 10; - - /** - * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling - * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the - * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking - * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, - * and may make use of them in assembly and calling, where possible. 
- */ - @Hidden - @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) - protected boolean includeUnmappedReads = false; - - @Advanced - @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) - protected boolean USE_ALLELES_TRIGGER = false; - - @Advanced - @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false) - protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; - - /** - * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their - * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of - * its edit distance from the reference, in that the read could have originated from the reference haplotype but - * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but - * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence - * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single - * read for all of these events. With this parameter set to Q30, though, the maximum evidence against the reference - * that this (and any) read could contribute against reference is Q30. 
- * - * Set this term to any negative number to turn off the global mapping rate - */ - @Advanced - @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) - protected int phredScaledGlobalReadMismappingRate = 45; - - /** - * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype - * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the - * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their - * weights, no matter how many paths are possible to generate from the graph. Putting this number too low - * will result in dropping true variation because paths that include the real variant are not even considered. - */ - @Advanced - @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 128; - - @Advanced - @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) - protected boolean mergeVariantsViaLD = false; - - // ----------------------------------------------------------------------------------------------- - // arguments for debugging / developing the haplotype caller - // ----------------------------------------------------------------------------------------------- - /** - * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
- */ - @Hidden - @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; - - @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) - protected String keepRG = null; - - @Hidden - @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) - protected boolean justDetermineActiveRegions = false; - - @Hidden - @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) - protected boolean dontGenotype = false; - - @Hidden - @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectKmers = false; - - @Advanced - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) - protected boolean DEBUG; - - @Hidden - @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) - protected boolean debugGraphTransformations = false; - - @Hidden // TODO -- not currently useful - @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) - protected boolean useLowQualityBasesForAssembly = false; - - @Hidden - @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) - protected boolean dontTrimActiveRegions = false; - - @Hidden - @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) - protected boolean dontUseSoftClippedBases = false; - - @Hidden - @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) - protected boolean captureAssemblyFailureBAM = false; - - @Hidden - @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate 
paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) - protected boolean allowCyclesInKmerGraphToGeneratePaths = false; - - @Hidden - @Argument(fullName="noFpga", shortName="noFpga", doc="If provided, disables the use of the FPGA HMM implementation", required = false) - protected boolean noFpga = false; - - // Parameters to control read error correction - @Hidden - @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) - protected boolean errorCorrectReads = false; - - @Hidden - @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false) - protected int kmerLengthForReadErrorCorrection = 25; - - @Hidden - @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) - protected int minObservationsForKmerToBeSolid = 20; - - /** - * the maximum extent into the full active region extension that we're willing to go in genotyping our events - */ - @Hidden - @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false) - protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25; - - @Hidden - @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false) - protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 
300; - - /** - * Include at least this many bases around an event for calling it - */ - @Hidden - @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false) - protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150; - - @Hidden - @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) - protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; - - /** - * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base - * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). - * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. - */ - @Advanced - @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) - public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; - - // ----------------------------------------------------------------------------------------------- - // done with Haplotype caller parameters - // ----------------------------------------------------------------------------------------------- - - // the UG engines - private UnifiedGenotyperEngine UG_engine = null; - private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; - - // the assembly engine - private LocalAssemblyEngine assemblyEngine = null; - - // the likelihoods engine - private LikelihoodCalculationEngine likelihoodCalculationEngine = null; - - // the genotyping engine - private GenotypingEngine genotypingEngine = null; - - // fasta reference reader to supplement the edges of the reference sequence - protected CachingIndexedFastaSequenceFile referenceReader; - - // reference base 
padding size - private static final int REFERENCE_PADDING = 500; - - private ActiveRegionTrimmer trimmer = null; - - private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument - private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument - - // bases with quality less than or equal to this value are trimmed off the tails of the reads - private static final byte MIN_TAIL_QUALITY = 20; - - private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - // the minimum length of a read we'd consider using for genotyping - private final static int MIN_READ_LENGTH = 10; - - private List samplesList = new ArrayList<>(); - - private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file - private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file - - ReferenceConfidenceModel referenceConfidenceModel = null; - - // as determined experimentally Nov-Dec 2013 - protected final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; - protected final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - public void initialize() { - super.initialize(); - - if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) - throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); - - // get all of the unique sample names - Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - samplesList.addAll( samples ); - // initialize the UnifiedGenotyper 
Engine which is used to call into the exact model - final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine - UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - - // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.CONTAMINATION_FRACTION = 0.0; - simpleUAC.CONTAMINATION_FRACTION_FILE = null; - simpleUAC.exactCallsLog = null; - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - - if( UAC.CONTAMINATION_FRACTION_FILE != null ) { - UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); - } - - // initialize the output VCF header - final 
VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - - Set headerInfo = new HashSet<>(); - - // all annotation fields from VariantAnnotatorEngine - headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); - // all callers need to add these standard annotation header lines - VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, - VCFConstants.DOWNSAMPLED_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - // all callers need to add these standard FORMAT field header lines - VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, - VCFConstants.GENOTYPE_KEY, - VCFConstants.GENOTYPE_QUALITY_KEY, - VCFConstants.DEPTH_KEY, - VCFConstants.GENOTYPE_PL_KEY); - - // FILTER fields are added unconditionally as it's not always 100% certain the circumstances - // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); - if ( emitReferenceConfidence() ) { - if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); - headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { - // a kluge to enforce the use of this indexing strategy - if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || - getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { - throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); - } - - try { - vcfWriter = new 
GVCFWriter(vcfWriter, GVCFGQBands); - } catch ( IllegalArgumentException e ) { - throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); - } - } - } - - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); - - try { - // fasta reference reader to supplement the edges of the reference sequence - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } - - // create and setup the assembler - assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); - - assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); - assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); - assemblyEngine.setDebug(DEBUG); - assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); - assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); - assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); - - if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); - if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); - - // setup the likelihood calculation engine - if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; - - // configure the global mismapping rate - if ( phredScaledGlobalReadMismappingRate < 0 ) { - log10GlobalReadMismappingRate = - Double.MAX_VALUE; - } else { - log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); - logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); - } - - // create our likelihood calculation engine - likelihoodCalculationEngine = createLikelihoodCalculationEngine(); - - 
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); - - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); - - if ( bamWriter != null ) { - // we currently do not support multi-threaded BAM writing, so exception out - if ( getToolkit().getTotalNumberOfThreads() > 1 ) - throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode."); - haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); - } - - trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING, - UAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ? MAX_GGA_ACTIVE_REGION_EXTENSION : MAX_DISCOVERY_ACTIVE_REGION_EXTENSION, - getToolkit().getGenomeLocParser()); - } - - /** - * Instantiates the appropriate likelihood calculation engine. - * - * @return never {@code null}. - */ - private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { - switch (likelihoodEngineImplementation) { - case PairHMM: - return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); - case GraphBased: - return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); - case Random: - return new RandomLikelihoodCalculationEngine(); - default: - //Note: we do not include in the error message list as it is of no grand public interest. - throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine + - "'. 
Please use one of the following instead: 'PairHMM' and 'GraphBased'."); - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // isActive - // - //--------------------------------------------------------------------------------------------------------------- - - // enable deletions in the pileup - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable non primary and extended reads in the active region - @Override - public EnumSet desiredReadStates() { - if ( includeUnmappedReads ) { - throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); -// return EnumSet.of( -// ActiveRegionReadState.PRIMARY, -// ActiveRegionReadState.NONPRIMARY, -// ActiveRegionReadState.EXTENDED, -// ActiveRegionReadState.UNMAPPED -// ); - } else - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); - } - - @Override - @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); - if( vcFromAllelesRod != null ) { - return new ActivityProfileState(ref.getLocus(), 1.0); - } - } - - if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 
1.0 : 0.0 ); - } - - if( context == null || context.getBasePileup().isEmpty() ) - // if we don't have any data, just abort early - return new ActivityProfileState(ref.getLocus(), 0.0); - - final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied - final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); - final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); - final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); - for( final Map.Entry sample : splitContexts.entrySet() ) { - final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods; - genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); - } - - final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); - final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); - final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - - return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? 
ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - private final static List NO_CALLS = Collections.emptyList(); - @Override - public List map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { - if ( justDetermineActiveRegions ) - // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work - return NO_CALLS; - - if( !originalActiveRegion.isActive() ) { - // Not active so nothing to do! - return referenceModelForNoVariation(originalActiveRegion, true); - } - - final List activeAllelesToGenotype = new ArrayList<>(); - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for ( final VariantContext vc : metaDataTracker.getValues(UG_engine.getUAC().alleles) ) { - if ( vc.isNotFiltered() ) { - activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode - } - } - // No alleles found in this region so nothing to do! - if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } - } else { - // No reads here so nothing to do! - if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } - } - - // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); - final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); - - // abort early if something is out of the acceptable range - if( ! 
assemblyResult.isVariationPresent() ) { - return referenceModelForNoVariation(originalActiveRegion, false); - } // only the reference haplotype remains so nothing else to do! - - if (dontGenotype) return NO_CALLS; // user requested we not proceed - - // filter out reads from genotyping which fail mapping quality based criteria - final Collection filteredReads = filterNonPassingReads( regionForGenotyping ); - final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - - if( regionForGenotyping.size() == 0 ) { - // no reads remain after filtering so nothing else to do! - return referenceModelForNoVariation(originalActiveRegion, false); - } - - // evaluate each sample's reads against all haplotypes - //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); - final List haplotypes = assemblyResult.getHaplotypeList(); - final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); - - // Calculate the likelihoods: CPU intesive part. - final Map stratifiedReadMap = - likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); - - - - - - // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there - // was a bad interaction between that selection and the marginalization that happens over each event when computing - // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the - // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included - // in the genotyping, but we lose information if we select down to a few haplotypes. 
[EB] - - final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, - haplotypes, - stratifiedReadMap, - perSampleFilteredReadList, - assemblyResult.getFullReferenceWithPadding(), - assemblyResult.getPaddedReferenceLoc(), - regionForGenotyping.getLocation(), - getToolkit().getGenomeLocParser(), - metaDataTracker, - activeAllelesToGenotype ); - - // TODO -- must disable if we are doing NCT, or set the output type of ! presorted - if ( bamWriter != null ) { - haplotypeBAMWriter.writeReadsAlignedToHaplotypes( - haplotypes, - assemblyResult.getPaddedReferenceLoc(), - haplotypes, - calledHaplotypes.getCalledHaplotypes(), - stratifiedReadMap); - } - - if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - - if ( emitReferenceConfidence() ) { - if ( calledHaplotypes.getCalls().isEmpty() ) { - // no called all of the potential haplotypes - return referenceModelForNoVariation(originalActiveRegion, false); - } else - return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), - calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - stratifiedReadMap, calledHaplotypes.getCalls()); - } else { - return calledHaplotypes.getCalls(); - } - } - - /** - * High-level function that runs the assembler on the active region reads, - * returning a data structure with the resulting information needed - * for further HC steps - * - * @param activeRegion the region we should assemble - * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) - * @return the AssemblyResult describing how to proceed with genotyping - */ - protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { - // Create the reference haplotype which is the bases from the reference that make up the active region - 
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails - - final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); - final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - final Haplotype referenceHaplotype = createReferenceHaplotype(activeRegion, paddedReferenceLoc); - - // Create ReadErrorCorrector object if requested - will be used within assembly engine. - ReadErrorCorrector readErrorCorrector = null; - if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); - - try { - final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - assemblyResultSet.debugDump(logger); - - if ( ! 
dontTrimActiveRegions ) { - final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype); - if (trimmedActiveRegion != null) - return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion); - else { - assemblyResultSet.resetVariationPresent(); - return assemblyResultSet; - } - } else - return assemblyResultSet; - } catch ( final Exception e ) { - // Capture any exception that might be thrown, and write out the assembly failure BAM if requested - if ( captureAssemblyFailureBAM ) { - final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); - for ( final GATKSAMRecord read : activeRegion.getReads() ) { - writer.addAlignment(read); - } - writer.close(); - } - throw e; - } - } - - /** - * Helper function to create the reference haplotype out of the active region and a padded loc - * @param activeRegion the active region from which to generate the reference haplotype - * @param paddedReferenceLoc the GenomeLoc which includes padding and shows how big the reference haplotype should be - * @return a non-null haplotype - */ - private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) { - return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc); - } - - /** - * Create an ref model result (ref model or no calls depending on mode) for an active region without any variation - * (not is active, or assembled to just ref) - * - * @param region the region to return a no-variation result - * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done) - * @return a list of variant contexts (can be empty) to emit for this ref region - */ - private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) { - if ( 
emitReferenceConfidence() ) { - if ( needsToBeFinalized ) finalizeActiveRegion(region); - filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion - final GenomeLoc paddedLoc = region.getExtendedLoc(); - final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); - final List haplotypes = Collections.singletonList(refHaplotype); - return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, - paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), - Collections.emptyList()); - } else { - return NO_CALLS; - } - } - - /** - * Create a context that maps each read to the reference haplotype with log10 L of 0 - * @param refHaplotype a non-null reference haplotype - * @param samples a list of all samples - * @param region the active region containing reads - * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref - */ - public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, - final List samples, - final ActiveRegion region) { - final Allele refAllele = Allele.create(refHaplotype, true); - - final Map map = new LinkedHashMap<>(1); - for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - for ( final GATKSAMRecord read : entry.getValue() ) { - likelihoodMap.add(read, refAllele, 0.0); - } - map.put(entry.getKey(), likelihoodMap); - } - - return map; - } - - private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection activeAllelesToGenotype) { - if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); - final List haplotypeList = resultSet.getHaplotypeList(); - final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping(); - 
EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG); - final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList); - allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype); - - final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false); - if ( trimmedActiveRegion == null ) { - // there were no variants found within the active region itself, so just return null - if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)"); - return null; - } - - // trim down the reads and add them to the trimmed active region - final List trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size()); - for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) { - final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, - trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); - if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) - trimmedReads.add(clippedRead); - } - trimmedActiveRegion.clearReads(); - trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); - - return trimmedActiveRegion; - } - - - /** - * Trims a assembly result set according to the active-region trimming. - * - * @param resultSet the original assembly result set. - * @param trimmedActiveRegion the trimmed active region to trim to. - * @return the assembly result set trimmed. 
- */ - private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) { - if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); - - final List haplotypeList = resultSet.getHaplotypeList(); - - // trim down the haplotypes - final Map originalByTrimmedHaplotypes = new HashMap<>(); - - for ( final Haplotype h : haplotypeList ) { - final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); - - if ( trimmed != null ) { - if (originalByTrimmedHaplotypes.containsKey(trimmed)) { - if (trimmed.isReference()) { - originalByTrimmedHaplotypes.remove(trimmed); - originalByTrimmedHaplotypes.put(trimmed, h); - } - } else - originalByTrimmedHaplotypes.put(trimmed,h); - } else if (h.isReference()) - throw new IllegalStateException("trimming eliminates the reference haplotype"); - else if ( DEBUG ) { - logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + - " because it starts with or ends with an insertion or deletion when trimmed to " + - trimmedActiveRegion.getExtendedLoc()); - } - } - - // create the final list of trimmed haplotypes - final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet()); - - // resort the trimmed haplotypes. 
- Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator()); - final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size()); - for (final Haplotype trimmed : trimmedHaplotypes) - sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed)); - - - if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + - trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + - haplotypeList.size() + " to only " + trimmedHaplotypes.size()); - if ( DEBUG ) - for ( final Haplotype remaining: trimmedHaplotypes ) - logger.info("Remains: " + remaining + " cigar " + remaining.getCigar()); - - return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(List callsInRegion, Integer numCalledRegions) { - for( final VariantContext call : callsInRegion ) { - vcfWriter.add( call ); - } - return (callsInRegion.isEmpty() ? 0 : 1) + numCalledRegions; - } - - @Override - public void onTraversalDone(Integer result) { - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it - referenceConfidenceModel.close(); - //TODO remove the need to call close here for debugging, the likelihood output stream should be managed - //TODO (open & close) at the walker, not the engine. 
- likelihoodCalculationEngine.close(); - logger.info("Ran local assembly on " + result + " active regions"); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // private helper functions - // - //--------------------------------------------------------------------------------------------------------------- - - private void finalizeActiveRegion( final ActiveRegion activeRegion ) { - if (activeRegion.isFinalized()) return; - - if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } - - // Loop through the reads hard clipping the adaptor and low quality tails - final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); - for( final GATKSAMRecord myRead : activeRegion.getReads() ) { - GATKSAMRecord clippedRead; - if (errorCorrectReads) - clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); - else if (useLowQualityBasesForAssembly) - clippedRead = myRead; - else // default case: clip low qual ends of reads - clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY ); - - if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) { - // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all - clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); - } else { - // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches - // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't - // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion - // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the - // TODO -- reference haplotype start must be removed - clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); - } - - clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) ); - if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { - clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); - if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { - //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); - readsToUse.add(clippedRead); - } - } - } - - // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. - - final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); - - // handle overlapping read pairs from the same fragment - cleanOverlappingReadPairs(downsampledReads); - - activeRegion.clearReads(); - activeRegion.addAll(downsampledReads); - activeRegion.setFinalized(true); - } - - private Set filterNonPassingReads( final ActiveRegion activeRegion ) { - final Set readsToRemove = new LinkedHashSet<>(); - for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { - readsToRemove.add(rec); - } - } - activeRegion.removeAll( readsToRemove ); - return readsToRemove; - } - - private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { - final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); - final int padRight = 
Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); - return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); - } - - private Map> splitReadsBySample( final Collection reads ) { - return splitReadsBySample(samplesList, reads); - } - - public static Map> splitReadsBySample( final List samplesList, final Collection reads ) { - final Map> returnMap = new HashMap<>(); - for( final String sample : samplesList) { - List readList = returnMap.get( sample ); - if( readList == null ) { - readList = new ArrayList<>(); - returnMap.put(sample, readList); - } - } - for( final GATKSAMRecord read : reads ) { - returnMap.get(read.getReadGroup().getSample()).add(read); - } - - return returnMap; - } - - /** - * Are we emitting a reference confidence in some form, or not? - * @return true if we are - */ - private boolean emitReferenceConfidence(){ - return emitReferenceConfidence != ReferenceConfidenceMode.NONE; - } - - /** - * Clean up reads/bases that overlap within read pairs - * - * @param reads the list of reads to consider - */ - private void cleanOverlappingReadPairs(final List reads) { - for ( final List perSampleReadList : splitReadsBySample(reads).values() ) { - final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); - for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) - FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java deleted file mode 100644 index 01ab421b3..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ /dev/null @@ 
-1,467 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; 
-import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.util.*; - -/** - * Haplotype-based resolution of variants in 2 different eval files. - * - *

- * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. - * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. - * Records are annotated with the set and status attributes. - * - *

Input

- *

- * 2 variant files to resolve. - *

- * - *

Output

- *

- * A single consensus VCF. - *

- * - *

Examples

- *
- * java -Xmx1g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T HaplotypeResolver \
- *   -V:v1 input1.vcf \
- *   -V:v2 input2.vcf \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW)) -public class HaplotypeResolver extends RodWalker { - - protected static final String INTERSECTION_SET = "intersection"; - protected static final String SAME_STATUS = "same"; - protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch"; - protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles"; - protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype"; - protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther"; - protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents"; - - protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50; - protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000; - protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100; - protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER; - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter baseWriter = null; - private VariantContextWriter writer; - - /** - * Set to 'null' if you don't want the set field emitted. - */ - @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) - protected String SET_KEY = "set"; - - /** - * Set to 'null' if you don't want the status field emitted. 
- */ - @Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false) - protected String STATUS_KEY = "status"; - - private final LinkedList queue = new LinkedList(); - private String source1, source2; - private final List sourceVCs1 = new ArrayList(); - private final List sourceVCs2 = new ArrayList(); - - - private class VCcontext { - public final Collection vcs; - public final GenomeLoc loc; - public final ReferenceContext ref; - - public VCcontext(final Collection vcs, final ReferenceContext ref) { - this.vcs = vcs; - this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next()); - this.ref = ref; - } - } - - public void initialize() { - - if ( variants.size() != 2 ) { - throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files"); - } - source1 = variants.get(0).getName(); - source2 = variants.get(1).getName(); - - if ( SET_KEY.toLowerCase().equals("null") ) - SET_KEY = null; - if ( STATUS_KEY.toLowerCase().equals("null") ) - STATUS_KEY = null; - - // for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header) - Set headerLines = new HashSet(); - if ( SET_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record")); - if ( STATUS_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match")); - final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.emptySet()); - baseWriter.writeHeader(vcfHeader); - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - final Collection VCs = tracker.getValues(variants, context.getLocation()); - if ( 
VCs.size() == 0 ) - return 0; - - final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref); - - // TODO -- what should we do about filtered records? - - if ( !queue.isEmpty() ) { - - final VCcontext previous = queue.getLast(); - if ( !previous.loc.onSameContig(vc.loc) || - previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS || - queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) { - purgeQueue(); - } - } - - queue.addLast(vc); - return 0; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - if ( !queue.isEmpty() ) - purgeQueue(); - writer.close(); - } - - private void purgeQueue() { - - final ReferenceContext refContext = queue.getFirst().ref; - - // divide them up by source - while ( !queue.isEmpty() ) { - VCcontext context = queue.removeFirst(); - for ( final VariantContext vc: context.vcs ) { - if ( vc.getSource().equals(source1) ) - sourceVCs1.add(vc); - else - sourceVCs2.add(vc); - } - } - - writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS); - - if ( sourceVCs1.isEmpty() ) { - writeAll(sourceVCs2, source2, null); - } else if ( sourceVCs2.isEmpty() ) { - writeAll(sourceVCs1, source1, null); - } else { - resolveByHaplotype(refContext); - } - - // allow for GC of the data - sourceVCs1.clear(); - sourceVCs2.clear(); - } - - private void writeAll(final List sourceVCs, final String set, final String status) { - for ( final VariantContext vc : sourceVCs ) { - writeOne(vc, set, status); - } - } - - private void writeOne(final VariantContext vc, final String set, final String status) { - final Map attrs = new HashMap(vc.getAttributes()); - if ( SET_KEY != null && set != null ) - attrs.put(SET_KEY, set); - if ( STATUS_KEY != null && status != null ) - attrs.put(STATUS_KEY, status); - writer.add(new VariantContextBuilder(vc).attributes(attrs).make()); - } - - 
private void writeAndPurgeAllEqualVariants(final List sourceVCs1, final List sourceVCs2, final String status) { - - int currentIndex1 = 0, currentIndex2 = 0; - int size1 = sourceVCs1.size(), size2 = sourceVCs2.size(); - VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); - - while ( current1 != null && current2 != null ) { - - final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); - final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); - - if ( loc1.equals(loc2) || - (loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) { - // test the alleles - if ( determineAndWriteOverlap(current1, current2, status) ) { - sourceVCs1.remove(currentIndex1); - sourceVCs2.remove(currentIndex2); - size1--; - size2--; - } else { - currentIndex1++; - currentIndex2++; - } - current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null); - } else if ( loc1.isBefore(loc2) ) { - currentIndex1++; - current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null); - } else { - currentIndex2++; - current2 = (currentIndex2 < size2 ? 
sourceVCs2.get(currentIndex2): null); - } - } - } - - private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) { - final int allelesFrom1In2 = findOverlap(vc1, vc2); - final int allelesFrom2In1 = findOverlap(vc2, vc1); - final int totalAllelesIn1 = vc1.getAlternateAlleles().size(); - final int totalAllelesIn2 = vc2.getAlternateAlleles().size(); - - final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1; - final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2; - - boolean thereIsOverlap = true; - - if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) { - writeOne(vc1, INTERSECTION_SET, status); - } else if ( allAllelesFrom1Overlap ) { - writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2); - } else if ( allAllelesFrom2Overlap ) { - writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1); - } else if ( allelesFrom1In2 > 0 ) { - writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS); - } else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction - writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS); - } else { - thereIsOverlap = false; - } - - return thereIsOverlap; - } - - private static int findOverlap(final VariantContext target, final VariantContext comparison) { - int overlap = 0; - for ( final Allele allele : target.getAlternateAlleles() ) { - if ( comparison.hasAlternateAllele(allele) ) - overlap++; - } - return overlap; - } - - private static final double SW_MATCH = 4.0; - private static final double SW_MISMATCH = -10.0; - private static final double SW_GAP = -25.0; - private static final double SW_GAP_EXTEND = -1.3; - private void resolveByHaplotype(final ReferenceContext refContext) { - - final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext); - final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext); - - final 
SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - - // protect against SW failures - if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 || - swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) { - // TODO -- handle errors appropriately - logger.debug("Bad SW alignment; aborting at " + refContext.getLocus()); - return; - } - - // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); - if ( source1Map.size() == 0 || source2Map.size() == 0 ) { - // TODO -- handle errors appropriately - logger.debug("No source alleles; aborting at " + refContext.getLocus()); - return; - } - - // create lists and test for equality - final List source1Alleles = new ArrayList(source1Map.values()); - final List source2Alleles = new ArrayList(source2Map.values()); - - writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS); - if ( source1Alleles.isEmpty() ) { - writeAll(source2Alleles, source2, null); - } else if ( source2Alleles.isEmpty() ) { - writeAll(source1Alleles, source1, null); - } else { - writeDifferences(source1Alleles, source2Alleles); - } - } - - private byte[] generateHaplotype(final List sourceVCs, final ReferenceContext refContext) { - - final StringBuilder sb = new StringBuilder(); - - final int 
startPos = refContext.getWindow().getStart(); - int currentPos = startPos; - final byte[] reference = refContext.getBases(); - - for ( final VariantContext vc : sourceVCs ) { - // add any missing reference context - int vcStart = vc.getStart(); - final int refAlleleLength = vc.getReference().length(); - if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position) - vcStart++; - - while ( currentPos < vcStart ) - sb.append((char)reference[currentPos++ - startPos]); - - // add the alt allele - sb.append(vc.getAlternateAllele(0).getBaseString()); - - // skip the reference allele - currentPos += refAlleleLength; - } - // add any missing reference context - final int stopPos = refContext.getWindow().getStop(); - while ( currentPos < stopPos ) - sb.append((char)reference[currentPos++ - startPos]); - - return sb.toString().getBytes(); - } - - private void writeDifferences(final List source1Alleles, final List source2Alleles) { - int currentIndex1 = 0, currentIndex2 = 0; - final int size1 = source1Alleles.size(), size2 = source2Alleles.size(); - VariantContext current1 = source1Alleles.get(0); - VariantContext current2 = source2Alleles.get(0); - - while ( currentIndex1 < size1 || currentIndex2 < size2 ) { - if ( current1 == null ) { - writeOne(current2, source2, null); - currentIndex2++; - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } else if ( current2 == null ) { - writeOne(current1, source1, null); - currentIndex1++; - current1 = (currentIndex1 < size1 ? 
source1Alleles.get(currentIndex1): null); - } else { - - final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1); - final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2); - - if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) { - String status; - if ( loc1.getStart() == loc2.getStart() ) { - final String allele1 = current1.getAlternateAllele(0).getBaseString(); - final String allele2 = current2.getAlternateAllele(0).getBaseString(); - if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 ) - status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS; - else - status = SAME_START_DIFFERENT_ALLELES_STATUS; - } else { - status = OVERLAPPING_EVENTS_STATUS; - } - - writeOne(current1, INTERSECTION_SET, status); - currentIndex1++; - currentIndex2++; - current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } else if ( loc1.isBefore(loc2) ) { - writeOne(current1, source1, null); - currentIndex1++; - current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null); - } else { - writeOne(current2, source2, null); - currentIndex2++; - current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null); - } - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java deleted file mode 100644 index a6c35bce0..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java +++ /dev/null @@ -1,461 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - - -import com.sun.istack.internal.NotNull; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.haplotype.Haplotype; - -import java.lang.reflect.Array; -import java.util.*; - -/** - * Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> - */ -public class KmerSequence implements List { - private final byte[] sequence; - private final int start; - private final int size; - private final int kmerSize; - private final int rawLength; - - /** - * Creates a kmer sequence from a read's sequence. - * - * @param read the read to represent as a sequence of kmers. - * @param kmerSize the kmer size. - */ - public KmerSequence(final SAMRecord read, final int kmerSize) { - this(read.getReadBases(), kmerSize); - } - - /** - * Creates a kmer sequence from a haplotype's sequence. - * - * @param hap the haplotype to represent as a sequence of kmers. - * @param kmerSize the kmer size. - */ - public KmerSequence(final Haplotype hap, final int kmerSize) { - this(hap.getBases(), kmerSize); - } - - /** - * Creates a kmer sequence out of a byte sequence. 
- * - * @param sequence the byte array to represent as a kmer sequence. - * @param kmerSize the kmer size. - */ - public KmerSequence(final byte[] sequence, final int kmerSize) { - this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); - } - - - /** - * Creates a kmer sequence out of a range of a byte array - * - * @param sequence the input array. - * @param start inclusive first position of the array that maps to the first position in the first kmer. - * @param size number kmers in the output. - * @param kmerSize kmer length in bases. - * @param rawLength the of the range in bases. - */ - protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) { - if (sequence == null) { - throw new IllegalArgumentException("start must be 0 or greater"); - } - if (rawLength > sequence.length - start) { - throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity"); - } - if (size < 0) { - throw new IllegalArgumentException("the length cannot be negative"); - } - if (start < 0) { - throw new IllegalArgumentException("start must be 0 or greater"); - } - if (size > 0 && size + kmerSize - 1 > rawLength) { - throw new IllegalArgumentException( - String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) ); - } - this.sequence = sequence; - this.start = start; - this.size = size; - this.kmerSize = kmerSize; - this.rawLength = rawLength; - } - - public int kmerSize() { - return kmerSize; - } - - public KmerSequence subsequence(final int from, final int to) { - if (from < 0 || from > to) { - throw new IllegalArgumentException(); - } - if (to > size) { - throw new IllegalArgumentException(); - } - return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to)); - } - - - @Override - public int size() { - return size; - } - - @Override - public boolean isEmpty() { - return 
size == 0; - } - - @Override - public boolean contains(final Object o) { - if (o instanceof Kmer) { - if (o instanceof MyKmer) { - final MyKmer k = (MyKmer) o; - if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) { - return true; - } - } - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return false; - } - for (int i = 0; i < size; i++) { - int j; - for (j = 0; j < kmerSize; j++) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == kmerSize) { - return true; - } - } - return false; - } else { - return false; - } - } - - @Override - @NotNull - public Iterator iterator() { - return new Iterator() { - - private int offset = 0; - - @Override - public boolean hasNext() { - return offset < size; - } - - @Override - public Kmer next() { - return new Kmer(sequence,start + offset,kmerSize); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - - @NotNull - @Override - public Object[] toArray() { - return toArray(new Kmer[size()]); - } - - @Override - @NotNull - @SuppressWarnings("unchecked") - public T[] toArray(@NotNull final T[] a) { - if (a == null) { - throw new IllegalArgumentException(); - } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { - throw new IllegalArgumentException(); - } else { - T[] result; - if (a.length < size) { - result = (T[]) Array.newInstance(a.getClass().getComponentType(), size); - } else { - result = a; - } - for (int i = 0; i < size; i++) { - result[i] = (T) new Kmer(sequence,start + i,kmerSize); - } - return result; - } - } - - @Override - public boolean add(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean remove(final Object o) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean containsAll(final Collection c) { - for (final Object o : c) - if (!contains(o)) - return false; - return true; - } - - @Override 
- public boolean addAll(final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean addAll(final int index, @NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean removeAll(@NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean retainAll(@NotNull final Collection c) { - throw new UnsupportedOperationException(); - } - - @Override - public void clear() { - throw new UnsupportedOperationException(); - } - - @Override - public Kmer get(final int index) { - if (index < 0 || index >= size) { - throw new IllegalArgumentException(); - } - return new Kmer(sequence,start + index,kmerSize); - } - - @Override - public Kmer set(final int index, final Kmer element) { - throw new UnsupportedOperationException(); - } - - @Override - public void add(final int index, final Kmer element) { - throw new UnsupportedOperationException(); - } - - @Override - public Kmer remove(final int index) { - throw new UnsupportedOperationException(); - } - - @Override - public int indexOf(final Object o) { - if (o instanceof Kmer) { - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return -1; - } - for (int i = 0; i < size; i++) { - int j; - for (j = 0; j < kmerSize; j++) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == kmerSize) { - return i; - } - } - return -1; - } else { - return -1; - } - } - - @Override - public int lastIndexOf(final Object o) { - if (o instanceof Kmer) { - final Kmer k = (Kmer) o; - if (k.length != kmerSize) { - return -1; - } - for (int i = size - 1; i >= 0; i--) { - int j; - for (j = kmerSize - 1; j >= 0; j--) { - if (sequence[start + i + j] != k.bases[k.start + j]) { - break; - } - } - if (j == 0) { - return i; - } - } - return -1; - } else { - return -1; - } - } - - @Override - @NotNull - public ListIterator listIterator() { - return new MyListIterator(0); - } - - @Override 
- @NotNull - public ListIterator listIterator(final int index) { - return new MyListIterator(index); - } - - @Override - @NotNull - public List subList(final int fromIndex, final int toIndex) { - return subsequence(fromIndex,toIndex); - } - - /** - * Returns the byte array representation of the kmer sequence. - * @return never {@code null}. - */ - @NotNull - public byte[] getBytes() { - if (start == 0 && rawLength == sequence.length) - return sequence; - else - return Arrays.copyOfRange(sequence, start, rawLength + start); - } - - /** - * Internal class that implements the {@link Kmer} more efficiently - * making reference to the sequence's own byte array. - */ - protected class MyKmer extends Kmer { - - /** - * Create a new instance give the offset in the byte array. - * @param start the start base offset for the kmer. - */ - public MyKmer(final int start) { - super(sequence,start,kmerSize); - } - } - - /** - * Iterator implementation of Kmer elements. - */ - private class MyListIterator implements ListIterator { - - private int i = 0; - - /** - * Creates a iterator at certain offset in the sequence. - * @param idx the start position or kmer offset. 
- */ - private MyListIterator(final int idx) { - i = idx; - } - - @Override - public boolean hasNext() { - return i < size; - } - - @Override - public Kmer next() { - return new Kmer(sequence,start + i++,kmerSize); - } - - @Override - public boolean hasPrevious() { - return i > 0; - } - - @Override - public Kmer previous() { - return new Kmer(sequence,start + --i,kmerSize); - } - - @Override - public int nextIndex() { - return i; - } - - @Override - public int previousIndex() { - return i - 1; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - @Override - public void set(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - @Override - public void add(final Kmer kmer) { - throw new UnsupportedOperationException(); - } - - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java deleted file mode 100644 index 4ec56f706..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ /dev/null @@ -1,513 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; 
-import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; - -import java.io.File; -import java.util.*; - -/** - * Code for estimating the reference confidence - * - * This code can estimate the probability that the data for a single sample is consistent with a - * well-determined REF/REF diploid genotype. - * - * User: depristo - * Date: 6/21/13 - * Time: 12:52 PM - */ -public class ReferenceConfidenceModel { - - //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now - public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele - - private final GenomeLocParser genomeLocParser; - private final Set samples; - private final SAMFileHeader header; // TODO -- really shouldn't depend on this - private final int indelInformativeDepthIndelSize; - - private final static boolean WRITE_DEBUGGING_BAM = false; - private final SAMFileWriter debuggingWriter; - - private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; - - /** - * Create a new ReferenceConfidenceModel - * - * @param genomeLocParser how we create genome locs - * @param samples the list of all samples we'll be considering with this model - * @param header the SAMFileHeader describing the read information (used for debugging) - * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths - */ - public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, - final Set samples, - final SAMFileHeader header, - final int indelInformativeDepthIndelSize) { - if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); - if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); - if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); - if ( header == null ) throw new IllegalArgumentException("header cannot be empty"); - if ( 
indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); - - this.genomeLocParser = genomeLocParser; - this.samples = samples; - this.header = header; - this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; - - if ( WRITE_DEBUGGING_BAM ) { - final SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(true); - debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam")); - } else { - debuggingWriter = null; - } - - initializeIndelPLCache(); - } - - /** - * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence - * @return a non-null set of VCFHeaderLines - */ - public Set getVCFHeaderLines() { - final Set headerLines = new LinkedHashSet<>(); - // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? - headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); - return headerLines; - } - - /** - * Close down this reference model, closing down any debugging information opened during execution - */ - public void close() { - if ( debuggingWriter != null ) debuggingWriter.close(); - } - - - /** - * Calculate the reference confidence for a single sample given the its read data - * - * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing - * detailed information about the certainty that the sample is hom-ref for each base in the region. 
- * - * - * - * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() - * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the - * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. - * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) - * @param activeRegion the active region we want to get the reference confidence over - * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes - * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the - * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence - * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) - * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence - * contexts as well as calls from variantCalls if any were provided - */ - public List calculateRefConfidence(final Haplotype refHaplotype, - final Collection calledHaplotypes, - final GenomeLoc paddedReferenceLoc, - final ActiveRegion activeRegion, - final Map stratifiedReadMap, - final List variantCalls) { - if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); - if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); - if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( 
stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); - if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); - - final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); - final byte[] ref = refHaplotype.getBases(); - final List results = new ArrayList<>(refSpan.size()); - final String sampleName = stratifiedReadMap.keySet().iterator().next(); - - final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); - for ( final ReadBackedPileup pileup : refPileups ) { - final GenomeLoc curPos = pileup.getLocation(); - final int offset = curPos.getStart() - refSpan.getStart(); - - final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls); - if ( overlappingSite != null ) { - // we have some overlapping site, add it to the list of positions - if ( overlappingSite.getStart() == curPos.getStart() ) - results.add(overlappingSite); - } else { - // otherwise emit a reference confidence variant context - final int refOffset = offset + globalRefOffset; - final byte refBase = ref[refOffset]; - final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); - homRefCalc.capByHomRefLikelihood(); - - final Allele refAllele = Allele.create(refBase, true); - final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), 
curPos.getStart(), curPos.getStart(), refSiteAlleles); - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); - gb.AD(homRefCalc.AD_Ref_Any); - gb.DP(homRefCalc.getDP()); - - // genotype likelihood calculation - final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); - final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); - final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); - - // now that we have the SNP and indel GLs, we take the one with the least confidence, - // as this is the most conservative estimate of our certainty that we are hom-ref. - // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000 - // we are very certain that there's no indel here, but the SNP confidence imply that we are - // far less confident that the ref base is actually the only thing here. So we take 0,10,100 - // as our GLs for the site. - final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); - - gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); - gb.PL(leastConfidenceGLs.getAsPLs()); - //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); - - vcb.genotypes(gb.make()); - results.add(vcb.make()); -// logger.info(" => VariantContext " + vcb.make()); - } - } - - return results; - } - - /** - * Get the GenotypeLikelihoods with the least strong corresponding GQ value - * @param gl1 first to consider (cannot be null) - * @param gl2 second to consider (cannot be null) - * @return gl1 or gl2, whichever has the worst GQ - */ - protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) { - return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? 
gl1 : gl2; - } - - /** - * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site - * - * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site - * @return non-null GenotypeLikelihoods given N - */ - protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; - } - - protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway - private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; - private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp - - private void initializeIndelPLCache() { - for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { - final double homRef = 0.0; - final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); - } - } - - /** - * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. 
alt - * - * @param pileup the read backed pileup containing the data we want to evaluate - * @param refBase the reference base at this pileup position - * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation - * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips - * @return a RefVsAnyResult genotype call - */ - public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { - final RefVsAnyResult result = new RefVsAnyResult(); - - for( final PileupElement p : pileup ) { - final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); - if( p.isDeletion() || qual > minBaseQual ) { - int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { - AA = 2; - BB = 0; - if( hqSoftClips != null && p.isNextToSoftClip() ) { - hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); - } - result.AD_Ref_Any[1] += p.getRepresentativeCount(); - } else { - result.AD_Ref_Any[0] += p.getRepresentativeCount(); - } - result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); - result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); - result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; - } - } - - return result; - } - - /** - * Get a list of pileups that span the entire active region span, in order, one for each position 
- */ - private List getPileupsOverReference(final Haplotype refHaplotype, - final Collection calledHaplotypes, - final GenomeLoc paddedReferenceLoc, - final ActiveRegion activeRegion, - final GenomeLoc activeRegionSpan, - final Map stratifiedReadMap) { - - if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); - if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); - if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); - if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); - if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); - - List realignedReads; - - if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner - realignedReads = activeRegion.getReads(); - } else { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.setOnlyRealignInformativeReads(true); - writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); - } - - if ( debuggingWriter != null ) - for ( final GATKSAMRecord read : 
realignedReads ) - debuggingWriter.addAlignment(read); - - final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - true, genomeLocParser, samples, false); - - final List pileups = new LinkedList<>(); - final int startPos = activeRegionSpan.getStart(); - AlignmentContext next = libs.advanceToLocus(startPos, true); - for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) { - if ( next != null && next.getLocation().getStart() == curPos ) { - pileups.add(next.getBasePileup()); - next = libs.hasNext() ? libs.next() : null; - } else { - // no data, so we create empty pileups - pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos))); - } - } - - return pileups; - } - - /** - * Return the rightmost variant context in maybeOverlapping that overlaps curPos - * - * @param curPos non-null genome loc - * @param maybeOverlapping a collection of variant contexts that might overlap curPos - * @return a VariantContext, or null if none overlaps - */ - protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) { - VariantContext overlaps = null; - for ( final VariantContext vc : maybeOverlapping ) { - if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) { - if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { - overlaps = vc; - } - } - } - return overlaps; - } - - /** - * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart - * assuming no insertions or deletions in the read w.r.t. 
the reference - * - * @param readBases non-null bases of the read - * @param readQuals non-null quals of the read - * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) - * @param refBases the reference bases - * @param refStart the offset into refBases that aligns to the readStart position in readBases - * @param maxSum if the sum goes over this value, return immediately - * @return the sum of quality scores for readBases that mismatch their corresponding ref bases - */ - protected final int sumMismatchingQualities(final byte[] readBases, - final byte[] readQuals, - final int readStart, - final byte[] refBases, - final int refStart, - final int maxSum) { - final int n = Math.min(readBases.length - readStart, refBases.length - refStart); - int sum = 0; - - for ( int i = 0; i < n; i++ ) { - final byte readBase = readBases[readStart + i]; - final byte refBase = refBases[refStart + i]; - if ( readBase != refBase ) { - sum += readQuals[readStart + i]; - if ( sum > maxSum ) // abort early - return sum; - } - } - - return sum; - } - - /** - * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart - * - * @param readBases non-null bases of the read - * @param readQuals non-null quals of the read - * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) - * @param refBases the reference bases - * @param refStart the offset into refBases that aligns to the readStart position in readBases - * @param maxIndelSize the max indel size to consider for the read to be informative - * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart - */ - protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases, - final byte[] readQuals, - final int readStart, - final byte[] refBases, - final int refStart, - final int maxIndelSize) { - // fast 
exit when n bases left < maxIndelSize - if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { - return false; - } - - final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); - - // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion - // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores - for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) { - for ( final boolean checkInsertion : Arrays.asList(true, false) ) { - final int readI, refI; - if ( checkInsertion ) { - readI = readStart + indelSize; - refI = refStart; - } else { - readI = readStart; - refI = refStart + indelSize; - } - - final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum); - if ( score <= baselineMMSum ) - return false; - } - } - - return true; - } - - /** - * Calculate the number of indel informative reads at pileup - * - * @param pileup a pileup - * @param pileupOffsetIntoRef the position of the pileup in the reference - * @param ref the ref bases - * @param maxIndelSize maximum indel size to consider in the informativeness calculation - * @return an integer >= 0 - */ - protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) { - int nInformative = 0; - for ( final PileupElement p : pileup ) { - final GATKSAMRecord read = p.getRead(); - final int offset = p.getOffset(); - - // doesn't count as evidence - if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) - continue; - - // todo -- this code really should handle CIGARs directly instead of relying on the above tests - if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { - 
nInformative += p.getRepresentativeCount(); - if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { - return MAX_N_INDEL_INFORMATIVE_READS; - } - } - } - return nInformative; - } - - /** - * Create a reference haplotype for an active region - * - * @param activeRegion the active region - * @param refBases the ref bases - * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation() - * @return a reference haplotype - */ - public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) { - final Haplotype refHaplotype = new Haplotype(refBases, true); - final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart(); - if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart); - refHaplotype.setAlignmentStartHapwrtRef(alignmentStart); - final Cigar c = new Cigar(); - c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); - refHaplotype.setCigar(c); - return refHaplotype; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java deleted file mode 100644 index 1cf986c00..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java +++ /dev/null @@ -1,285 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - - -import java.util.List; -import java.util.ListIterator; - -/** - * Represents a route or path through a graph. - *

- * In contrast with a {@link Path}, a route keeps track of the - * path taken at furcations in order to speed up some path comparisions like the - * one implemented by {@link #isSuffix}. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class Route extends Path { - - protected final Route previousRouteWithLastVertexThatIsForkOrJoin; - protected final boolean lastVertexIsForkOrJoin; - - /** - * Create a zero length route with a start in a particular vertex: - * - * @param initialVertex the first vertex of the route. - * @param graph the new route's graph. - * - * @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}. - * or if {@code initialVertex} does not belong to {@code graph}. - */ - public Route(final V initialVertex, final BaseGraph graph) { - super(initialVertex, graph); - previousRouteWithLastVertexThatIsForkOrJoin = null; - lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1; - } - - @Override - public boolean equals(final Object other) { - if (other == null) return false; - if (other == this) return true; - if (! (other instanceof Route)) return false; - @SuppressWarnings("unchecked") - final Route otherRoute = (Route) other; - return otherRoute.length() == this.length() && isSuffix(otherRoute); - } - - /** - * Extends a route into a new instance. - * - * @param prefix the route to extend. - * @param nextVertex the vertex to extend the route to. - * - * @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null} - * or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph - * that would connect {@code prefix}'s last vertex with {@code nextVertex} directly. - */ - public Route(final Route prefix, final V nextVertex) { - this(prefix,resolveSuffixEdge(prefix,nextVertex)); - } - - - /** - * Extends a route into a new instance. - * - * @param prevVertex the vertex to extend the route to. - * @param suffix the route to extend. 
- * - * @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null} - * or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph - * that would connect {@code suffix}'s first vertex with {@code prevVertex} directly. - */ - public Route(final V prevVertex, final Route suffix) { - this(resolvePrefixEdge(prevVertex, suffix),suffix); - } - - /** - * Resolves the prefix edge as required by {@link Route(V,Route)}. - */ - private static E resolvePrefixEdge(final V prevVertex, final Route suffix) { - if (prevVertex == null) throw new NullPointerException(); - if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException(); - final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex()); - if (result == null) - throw new IllegalArgumentException("there is no such edge in the graph"); - return result; - } - - /** - * Resolves the suffix edge as required by {@link Route(Route,V)} - */ - private static E resolveSuffixEdge(final Route prefix, final V nextVertex) { - if (nextVertex == null) throw new IllegalArgumentException(); - if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException(); - final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex); - if (result == null) - throw new IllegalArgumentException("there is no such edge in the graph"); - return result; - } - - /** - * Extends a route by prefixing an edge. - * - * @param initialEdge the extending edge. - * @param suffix the original path. - * - * @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is - * not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}. 
- */ - public Route(final E initialEdge, final Route suffix) { - super(initialEdge,suffix); - final V firstVertex = getFirstVertex(); - if(suffix.length() == 0) { - lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1; - previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; - } else { - lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin; - if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null) - previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin); - else - previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ? - new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) : - graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; - } - } - - /** - * Create copy of an existing route. - * @param route the route to copy - * - * @throws NullPointerException if {@code route} is {@code null}. - */ - protected Route(final Route route) { - super(route); - lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin; - previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin; - } - - /** - * Create a new Route extending another one with an edge - * - * @param route the route to extend. - * @param edge the edge to extend the route with. - * - * @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is - * not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}. - */ - public Route(final Route route, final E edge) { - super(route, edge); - lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1; - previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? 
route : route.previousRouteWithLastVertexThatIsForkOrJoin; - } - - @Override - public boolean isSuffix(final Path other) { - if (other == this) - return true; - else if (other == null) - throw new IllegalArgumentException("other path must not be null"); - else if (getGraph() != other.getGraph()) - throw new IllegalArgumentException("other path must be part of the same graph"); - else if (other instanceof Route) - return isRouteSuffix((Route)other); - else - return super.isSuffix(other); - } - - @Override - public String toString() { - return super.toString().replace("Path{", "Route{"); - } - - /** - * Faster version when comparing with a route. - */ - protected boolean isRouteSuffix(final Route other) { - if (other.getGraph() != this.getGraph()) - throw new IllegalArgumentException("you cannot compare routes on different graphs"); - else if (lastVertex != other.lastVertex) // obvious case. - return false; - else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null - && other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure. - return false; - else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose! 
- return false; - else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) { - final ListIterator myEdges = edgesInOrder.listIterator(edgesInOrder.size()); - final ListIterator otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size()); - while (otherEdges.hasPrevious()) - if (myEdges.previous() != otherEdges.previous()) - return false; - return true; - } else - return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin) - || (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex - && previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin)); - } - - /** - * Checks whether the last vertex in the route is a fork or a joining vertex. - * @return {@code true} iff so. - */ - public boolean lastVertexIsForkOrJoin() { - return lastVertexIsForkOrJoin; - } - - /** - * Returns the longest prefix route that has as a last vertex a join or furcation vertex. - * - * @return never {@code null}. - */ - public Route getPrefixRouteWithLastVertexThatIsForkOrJoin() { - return previousRouteWithLastVertexThatIsForkOrJoin; - } - - - - /** - * Splice out the first few vertices of the route. - * - * @param length how many vertices to splice out - * @return a new route without those spliced vertices. - * - * @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative. - * Notice that non-vertex route are no legal routes. 
- */ - public Route splicePrefix(final int length) { - if (length == 0) - return this; - if (length >= length()) - throw new IllegalArgumentException("prefix slicing to long"); - if (length < 0) - throw new IllegalArgumentException("prefix cannot be negative"); - - final List resultEdges = getEdges().subList(length,length()); - Route result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),this); - for (final E edge : resultEdges) - result = new Route<>(result,edge); - return result; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java deleted file mode 100644 index c0848663e..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ /dev/null @@ -1,1611 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import net.sf.samtools.util.RuntimeIOException; -import net.sf.samtools.util.SequenceUtil; -import net.sf.samtools.util.StringUtil; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.BAQMode; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.smithwaterman.Parameters; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; -import org.broadinstitute.sting.utils.text.XReadLines; -import 
org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; -import java.util.*; - -/** - * Performs local realignment of reads to correct misalignments due to the presence of indels. - * - *

- * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases - * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching - * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are - * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, - * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus - * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and - * specifically identify indels. - *

- *
    There are 2 steps to the realignment process: - *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. - *
  3. Running the realigner over those intervals (IndelRealigner)
  4. - *
- *

- * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 - *

- * - *

Input

- *

- * One or more aligned BAM files and optionally one or more lists of known indels. - *

- * - *

Output

- *

- * A realigned version of your input BAM file(s). - *

- * - *

Example

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T IndelRealigner \
- *   -R ref.fasta \
- *   -I input.bam \
- *   -targetIntervals intervalListFromRTC.intervals \
- *   -o realignedBam.bam \
- *   [-known /path/to/indels.vcf] \
- *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
- * 
- * - *

Caveats

- * - *
  • - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. - *
  • - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). - *
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) -public class IndelRealigner extends ReadWalker { - - public static final String ORIGINAL_CIGAR_TAG = "OC"; - public static final String ORIGINAL_POSITION_TAG = "OP"; - public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; - - public enum ConsensusDeterminationModel { - /** - * Uses only indels from a provided ROD of known indels. - */ - KNOWNS_ONLY, - /** - * Additionally uses indels already present in the original alignments of the reads. - */ - USE_READS, - /** - * Additionally uses 'Smith-Waterman' to generate alternate consenses. - */ - USE_SW - } - - /** - * Any number of VCF files representing known indels to be used for constructing alternate consenses. - * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. - */ - @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) - public List> known = Collections.emptyList(); - - /** - * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). - */ - @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) - protected IntervalBinding intervalsFile = null; - - /** - * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number - * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, - * this number should be smaller. 
- */ - @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) - protected double LOD_THRESHOLD = 5.0; - - /** - * The realigned bam file. - */ - @Output(required=false, doc="Output bam", defaultToStdout=false) - protected StingSAMFileWriter writer = null; - protected ConstrainedMateFixingManager manager = null; - protected SAMFileWriter writerToUse = null; - - /** - * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; - * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). - */ - @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; - - - // ADVANCED OPTIONS FOLLOW - - /** - * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner - * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply - * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. - */ - @Advanced - @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) - protected double MISMATCH_THRESHOLD = 0.15; - - /** - * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; - * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. 
- */ - @Advanced - @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) - protected int MAX_RECORDS_IN_MEMORY = 150000; - - /** - * For expert users only! - */ - @Advanced - @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) - protected int MAX_ISIZE_FOR_MOVEMENT = 3000; - - /** - * For expert users only! - */ - @Advanced - @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) - protected int MAX_POS_MOVE_ALLOWED = 200; - - /** - * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. - */ - @Advanced - @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) - protected int MAX_CONSENSUSES = 30; - - /** - * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. - */ - @Advanced - @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) - protected int MAX_READS_FOR_CONSENSUSES = 120; - - /** - * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. - * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. 
- */ - @Advanced - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) - protected int MAX_READS = 20000; - - @Advanced - @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") - protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - - /** - * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that - * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), - * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the - * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output - * file name (2nd column) for each input file name (1st column). - * - * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). - */ - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") - protected String N_WAY_OUT = null; - - @Hidden - @Argument(fullName="generate_nWayOut_md5s",doc="Generate md5sums for BAMs") - protected boolean generateMD5s = false; - - // DEBUGGING OPTIONS FOLLOW - - @Hidden - @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") - protected boolean CHECKEARLY = false; - - @Hidden - @Argument(fullName="noPGTag", shortName="noPG", required=false, - doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
This option is required in order to pass integration tests.") - protected boolean NO_PG_TAG = false; - - @Hidden - @Argument(fullName="keepPGTags", shortName="keepPG", required=false, - doc="Keep older PG tags left in the bam header by previous runs of this tool (by default, all these "+ - "historical tags will be replaced by the latest tag generated in the current run).") - protected boolean KEEP_ALL_PG_RECORDS = false; - - @Hidden - @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") - protected String OUT_INDELS = null; - - @Hidden - @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) - protected String OUT_STATS = null; - - @Hidden - @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) - protected String OUT_SNPS = null; - - // fasta reference reader to supplement the edges of the reference sequence - private CachingIndexedFastaSequenceFile referenceReader; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - private boolean sawReadInCurrentInterval = false; - - // the reads and known indels that fall into the current interval - private ReadBin readsToClean; - private final ArrayList readsNotToClean = new ArrayList(); - private final ArrayList knownIndelsToTry = new ArrayList(); - private final HashSet indelRodsSeen = new HashSet(); - private final HashSet readsActuallyCleaned = new HashSet(); - - private static final int MAX_QUAL = 99; - - // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned - private 
static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; - - private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); - - // reference base padding size - // TODO -- make this a command-line argument if the need arises - private static final int REFERENCE_PADDING = 30; - - // other output files - private FileWriter indelOutput = null; - private FileWriter statsOutput = null; - private FileWriter snpsOutput = null; - - //###protected Map nwayWriters = null; - - - // debug info for lazy SW evaluation: - private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had - private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment - private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) - - private Map loadFileNameMap(String mapFile) { - Map fname_map = new HashMap(); - - try { - - XReadLines reader = new XReadLines(new File(mapFile),true); - for ( String line : reader ) { - if ( line.length() == 0 ) continue; - - String fields[] = line.split("\t"); - - if ( fields.length != 2 ) - throw new UserException.BadInput("Input-output map file must have exactly two columns. Offending line:\n"+line); - if ( fields[0].length() == 0 || fields[1].length() == 0 ) - throw new UserException.BadInput("Input-output map file can not have empty strings in either column. 
Offending line:\n"+line); - - if ( fname_map.containsKey(fields[0]) ) - throw new UserException.BadInput("Input-output map file contains duplicate entries for input name "+fields[0]); - if ( fname_map.containsValue(fields[1]) ) - throw new UserException.BadInput("Input-output map file maps multiple entries onto single output name "+fields[1]); - - fname_map.put(fields[0],fields[1]); - } - } catch (IOException e) { - throw new StingException("I/O Error while reading input-output map file "+N_WAY_OUT+": "+e.getMessage()); - } - return fname_map; - } - - public void initialize() { - readsToClean = new ReadBin(getToolkit().getGenomeLocParser(), REFERENCE_PADDING); - - if ( N_WAY_OUT == null && writer == null ) { - throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); - } - if ( N_WAY_OUT != null && writer != null ) { - throw new UserException.CommandLineException("-o and -nWayOut can not be used simultaneously"); - } - if ( LOD_THRESHOLD < 0.0 ) - throw new RuntimeException("LOD threshold cannot be a negative number"); - if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) - throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); - - try { - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); - } - - intervals = intervalsFile.getIntervals(getToolkit()).iterator(); - - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - - if ( N_WAY_OUT != null ) { - boolean createIndex = true; - - if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { - writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT), - SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); - } else { - writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, - createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); - } - } else { - // set up the output writer - setupWriter(getToolkit().getSAMFileHeader()); - writerToUse = writer; - } - manager = new ConstrainedMateFixingManager(writerToUse, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); - - if ( OUT_INDELS != null ) { - try { - indelOutput = new FileWriter(new File(OUT_INDELS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed"); - logger.error(e.getMessage()); - indelOutput = null; - } - } - if ( OUT_STATS != null ) { - try { - statsOutput = new FileWriter(new File(OUT_STATS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed"); - logger.error(e.getMessage()); - statsOutput = null; - } - } - if ( OUT_SNPS != null ) { - try { - snpsOutput = new FileWriter(new File(OUT_SNPS)); - } catch (Exception e) { - logger.error("Failed to create output file "+ OUT_SNPS+". 
Cleaning snps output will be suppressed"); - logger.error(e.getMessage()); - snpsOutput = null; - } - } - } - - private void setupWriter(SAMFileHeader header) { - - if ( !NO_PG_TAG ) { - final SAMProgramRecord programRecord = createProgramRecord(); - - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - } - - writer.writeHeader(header); - writer.setPresorted(true); - } - - - private SAMProgramRecord createProgramRecord() { - if ( NO_PG_TAG ) return null; - - final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - try { - final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); - programRecord.setProgramVersion(version); - } catch (MissingResourceException e) { - // this is left empty on purpose (perhaps Andrey knows why?) 
- } - programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); - return programRecord; - } - - private void emit(final GATKSAMRecord read) { - - // check to see whether the read was modified by looking at the temporary tag - boolean wasModified = readsActuallyCleaned.contains(read); - - try { - manager.addRead(read, wasModified); - } catch (RuntimeIOException e) { - throw new UserException.ErrorWritingBamFile(e.getMessage()); - } - } - - private void emitReadLists() { - // pre-merge lists to sort them in preparation for constrained SAMFileWriter - readsNotToClean.addAll(readsToClean.getReads()); - ReadUtils.sortReadsByCoordinate(readsNotToClean); - manager.addReads(readsNotToClean, readsActuallyCleaned); - readsToClean.clear(); - readsNotToClean.clear(); - readsActuallyCleaned.clear(); - } - - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - if ( currentInterval == null ) { - emit(read); - return 0; - } - - // edge case: when the last target interval abuts the end of the genome, we'll get one of the - // unmapped reads while the currentInterval still isn't null. We need to trigger the cleaning - // at this point without trying to create a GenomeLoc. 
- if ( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ) { - cleanAndCallMap(ref, read, metaDataTracker, null); - return 0; - } - - GenomeLoc readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = getToolkit().getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.isBefore(currentInterval) ) { - if ( !sawReadInCurrentInterval ) - emit(read); - else - readsNotToClean.add(read); - } - else if ( readLoc.overlapsP(currentInterval) ) { - sawReadInCurrentInterval = true; - - if ( doNotTryToClean(read) ) { - readsNotToClean.add(read); - } else { - readsToClean.add(read); - - // add the rods to the list of known variants - populateKnownIndels(metaDataTracker); - } - - if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { - logger.info("Not attempting realignment in interval " + currentInterval + " because there are too many reads."); - abortCleanForCurrentInterval(); - } - } - else { // the read is past the current interval - logger.debug(currentInterval.toString() + "\t" + read.getAlignmentStart() ); - cleanAndCallMap(ref, read, metaDataTracker, readLoc); - } - - return 0; - } - - private void abortCleanForCurrentInterval() { - emitReadLists(); - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - sawReadInCurrentInterval = false; - } - - private boolean doNotTryToClean(GATKSAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT) || - ReadUtils.is454Read(read) || - ReadUtils.isIonRead(read); - // TODO -- it would be nice if we could use indels from 454/Ion reads as alternate consenses - } - - private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { - if ( readsToClean.size() > 0 ) { - GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); - if ( manager.canMoveReads(earliestPossibleMove) ) - clean(readsToClean); - } - knownIndelsToTry.clear(); - indelRodsSeen.clear(); - - emitReadLists(); - try { - do { - currentInterval = intervals.hasNext() ? intervals.next() : null; - - } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); - } catch (ReviewedStingException e) { - throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. 
***", e); - } - sawReadInCurrentInterval = false; - - // call back into map now that the state has been updated - map(ref, read, metaDataTracker); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - if ( readsToClean.size() > 0 ) { - GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); - if ( manager.canMoveReads(earliestPossibleMove) ) - clean(readsToClean); - emitReadLists(); - } else if ( readsNotToClean.size() > 0 ) { - emitReadLists(); - } - - knownIndelsToTry.clear(); - indelRodsSeen.clear(); - - if ( OUT_INDELS != null ) { - try { - indelOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt."); - } - } - if ( OUT_STATS != null ) { - try { - statsOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); - } - } - if ( OUT_SNPS != null ) { - try { - snpsOutput.close(); - } catch (Exception e) { - logger.error("Failed to close "+OUT_SNPS+" gracefully. 
Data may be corrupt."); - } - } - - manager.close(); - if ( N_WAY_OUT != null ) writerToUse.close(); - - if ( CHECKEARLY ) { - logger.info("SW alignments runs: "+SWalignmentRuns); - logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); - logger.info("SW alignments skipped (perfect match): "+exactMatchesFound); - logger.info("Total reads SW worked for: "+(SWalignmentSuccess + exactMatchesFound)+ - " ("+(SWalignmentSuccess+exactMatchesFound)/(SWalignmentRuns+exactMatchesFound)+"% of all reads requiring SW)"); - } - } - - private void populateKnownIndels(RefMetaDataTracker metaDataTracker) { - for ( final VariantContext vc : metaDataTracker.getValues(known) ) { - if ( indelRodsSeen.contains(vc) ) - continue; - indelRodsSeen.add(vc); - knownIndelsToTry.add(vc); - } - } - - private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) { - final byte[] readSeq = aRead.getReadBases(); - final byte[] quals = aRead.getBaseQualities(); - int sum = 0; - for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { - if ( refIndex >= refSeq.length ) { - sum += MAX_QUAL; - // optimization: once we pass the threshold, stop calculating - if ( sum > quitAboveThisValue ) - return sum; - } else { - byte refChr = refSeq[refIndex]; - byte readChr = readSeq[readIndex]; - if ( !BaseUtils.isRegularBase(readChr) || !BaseUtils.isRegularBase(refChr) ) - continue; // do not count Ns/Xs/etc ? 
- if ( readChr != refChr ) { - sum += (int)quals[readIndex]; - // optimization: once we pass the threshold, stop calculating - if ( sum > quitAboveThisValue ) - return sum; - } - } - } - return sum; - } - - private void clean(ReadBin readsToClean) { - - final List reads = readsToClean.getReads(); - if ( reads.size() == 0 ) - return; - - byte[] reference = readsToClean.getReference(referenceReader); - int leftmostIndex = readsToClean.getLocation().getStart(); - - final ArrayList refReads = new ArrayList(); // reads that perfectly match ref - final ArrayList altReads = new ArrayList(); // reads that don't perfectly match - final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? - final Set altConsenses = new LinkedHashSet(); // list of alt consenses - - // if there are any known indels for this region, get them and create alternate consenses - generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference); - - // decide which reads potentially need to be cleaned; - // if there are reads with a single indel in them, add that indel to the list of alternate consenses - long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); - - // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference, using totalRawMismatchSum as the random seed - if ( consensusModel == ConsensusDeterminationModel.USE_SW ) - generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); - - // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); - - Consensus bestConsensus = null; - - for (Consensus consensus : altConsenses) { - //logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str)); - -// if ( DEBUG ) { -// System.out.println("Checking consensus with alignment at "+consensus.positionOnReference+" cigar 
"+consensus.cigar); -// System.out.println(new String(consensus.str)); -// int z = 0; -// for ( ; z < consensus.positionOnReference; z++ ) System.out.print('.'); -// for ( z=0 ; z < consensus.cigar.getCigarElement(0).getLength() ; z++ ) System.out.print('.'); -// if ( consensus.cigar.getCigarElement(1).getOperator() == CigarOperator.I ) for ( z= 0; z < consensus.cigar.getCigarElement(1).getLength(); z++ ) System.out.print('I'); -// System.out.println(); -// } - - // if ( debugOn ) System.out.println("Consensus: "+consensus.str); - - for (int j = 0; j < altReads.size(); j++) { - AlignedRead toTest = altReads.get(j); - Pair altAlignment = findBestOffset(consensus.str, toTest, leftmostIndex); - - // the mismatch score is the min of its alignment vs. the reference and vs. the alternate - int myScore = altAlignment.second; - - if (myScore > toTest.getAlignerMismatchScore() || myScore >= toTest.getMismatchScoreToReference()) - myScore = toTest.getMismatchScoreToReference(); - // keep track of reads that align better to the alternate consensus. - // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het) - else - consensus.readIndexes.add(new Pair(j, altAlignment.first)); - - //logger.debug(consensus.cigar + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + toTest.getMismatchScoreToReference()); - if (!toTest.getRead().getDuplicateReadFlag()) - consensus.mismatchSum += myScore; - - // optimization: once the mismatch sum is higher than the best consensus, quit since this one can't win - // THIS MUST BE DISABLED IF WE DECIDE TO ALLOW MORE THAN ONE ALTERNATE CONSENSUS! 
- if (bestConsensus != null && consensus.mismatchSum > bestConsensus.mismatchSum) - break; - } - - //logger.debug("Mismatch sum of new consensus: " + consensus.mismatchSum); - if (bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { - // we do not need this alt consensus, release memory right away!! - if (bestConsensus != null) - bestConsensus.readIndexes.clear(); - bestConsensus = consensus; - //logger.debug("New consensus " + bestConsensus.cigar + " is now best consensus"); - } else { - // we do not need this alt consensus, release memory right away!! - consensus.readIndexes.clear(); - } - } - - // if: - // 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads, - // 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads, - // 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy), - // then clean! - final double improvement = (bestConsensus == null ? 
-1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); - if ( improvement >= LOD_THRESHOLD ) { - - bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference, true); - - // start cleaning the appropriate reads - for ( Pair indexPair : bestConsensus.readIndexes ) { - AlignedRead aRead = altReads.get(indexPair.first); - if ( !updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, leftmostIndex) ) - return; - } - if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { - if ( statsOutput != null ) { - try { - statsOutput.write(currentInterval.toString()); - statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) - statsOutput.write(Double.toString(improvement)); - statsOutput.write("\n"); - statsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - } else { - //logger.debug("CLEAN: " + bestConsensus.cigar + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() ); - if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { - // NOTE: indels are printed out in the format specified for the low-coverage pilot1 - // indel calls (tab-delimited): chr position size type sequence - StringBuilder str = new StringBuilder(); - str.append(reads.get(0).getReferenceName()); - int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); - str.append("\t").append(leftmostIndex + position - 1); - CigarElement ce = bestConsensus.cigar.getCigarElement(1); - str.append("\t").append(ce.getLength()).append("\t").append(ce.getOperator()).append("\t"); - int length = ce.getLength(); - if ( 
ce.getOperator() == CigarOperator.D ) { - for ( int i = 0; i < length; i++) - str.append((char)reference[position+i]); - } else { - for ( int i = 0; i < length; i++) - str.append((char)bestConsensus.str[position+i]); - } - str.append("\t").append((((double) (totalRawMismatchSum - bestConsensus.mismatchSum)) / 10.0)).append("\n"); - try { - indelOutput.write(str.toString()); - indelOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("indelOutput", "Failed to write indel output file", e); - } - } - if ( statsOutput != null ) { - try { - statsOutput.write(currentInterval.toString()); - statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced - if ( bestConsensus.cigar.numCigarElements() > 1 ) - statsOutput.write(" (found indel)"); - statsOutput.write("\t"); - statsOutput.write(Double.toString(improvement)); - statsOutput.write("\n"); - statsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - - // finish cleaning the appropriate reads - for ( Pair indexPair : bestConsensus.readIndexes ) { - final AlignedRead aRead = altReads.get(indexPair.first); - if ( aRead.finalizeUpdate() ) { - // We need to update the mapping quality score of the cleaned reads; - // however we don't have enough info to use the proper MAQ scoring system. - // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010]. 
- // TODO -- we need a better solution here - GATKSAMRecord read = aRead.getRead(); - if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it - read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); - - // before we fix the attribute tags we first need to make sure we have enough of the reference sequence - int neededBasesToLeft = leftmostIndex - read.getAlignmentStart(); - int neededBasesToRight = read.getAlignmentEnd() - leftmostIndex - reference.length + 1; - int neededBases = Math.max(neededBasesToLeft, neededBasesToRight); - if ( neededBases > 0 ) { - int padLeft = Math.max(leftmostIndex-neededBases, 1); - int padRight = Math.min(leftmostIndex+reference.length+neededBases, referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength()); - reference = referenceReader.getSubsequenceAt(currentInterval.getContig(), padLeft, padRight).getBases(); - leftmostIndex = padLeft; - } - - // now, fix the attribute tags - // TODO -- get rid of this try block when Picard does the right thing for reads aligned off the end of the reference - try { - if ( read.getAttribute(SAMTag.NM.name()) != null ) - read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); - if ( read.getAttribute(SAMTag.UQ.name()) != null ) - read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); - } catch (Exception e) { - // ignore it - } - // TODO -- this is only temporary until Tim adds code to recalculate this value - if ( read.getAttribute(SAMTag.MD.name()) != null ) - read.setAttribute(SAMTag.MD.name(), null); - - // mark that it was actually cleaned - readsActuallyCleaned.add(read); - } - } - } - - // END IF ( improvement >= LOD_THRESHOLD ) - - } else if ( statsOutput != null ) { - try { - statsOutput.write(String.format("%s\tFAIL\t%.1f%n", - currentInterval.toString(), improvement)); - statsOutput.flush(); - } 
catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("statsOutput", "Failed to write stats output file", e); - } - } - } - - private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { - for ( VariantContext knownIndel : knownIndelsToTry ) { - if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) - continue; - final byte[] indelStr; - if ( knownIndel.isSimpleInsertion() ) { - final byte[] fullAllele = knownIndel.getAlternateAllele(0).getBases(); - indelStr = Arrays.copyOfRange(fullAllele, 1, fullAllele.length); // remove ref padding - } else { - indelStr = Utils.dupBytes((byte)'-', knownIndel.getReference().length() - 1); - } - int start = knownIndel.getStart() - leftmostIndex + 1; - Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); - if ( c != null ) - altConsensesToPopulate.add(c); - } - } - - private long determineReadsThatNeedCleaning(final List reads, - final ArrayList refReadsToPopulate, - final ArrayList altReadsToPopulate, - final LinkedList altAlignmentsToTest, - final Set altConsenses, - final int leftmostIndex, - final byte[] reference) { - - long totalRawMismatchSum = 0L; - for ( final GATKSAMRecord read : reads ) { - - // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { - refReadsToPopulate.add(read); - continue; - } - - final AlignedRead aRead = new AlignedRead(read); - - // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence - int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); - if ( numBlocks == 2 ) { - Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-leftmostIndex, 0, true); - aRead.setCigar(newCigar, false); - } - - final int startOnRef = read.getAlignmentStart()-leftmostIndex; - final int rawMismatchScore = 
mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); - - // if this doesn't match perfectly to the reference, let's try to clean it - if ( rawMismatchScore > 0 ) { - altReadsToPopulate.add(aRead); - //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); - - if ( !read.getDuplicateReadFlag() ) - totalRawMismatchSum += rawMismatchScore; - aRead.setMismatchScoreToReference(rawMismatchScore); - aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); - - // if it has an indel, let's see if that's the best consensus - if ( consensusModel != ConsensusDeterminationModel.KNOWNS_ONLY && numBlocks == 2 ) { - Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); - if ( c != null ) - altConsenses.add(c); - } else { - altAlignmentsToTest.add(aRead); - } - } - // otherwise, we can emit it as is - else { - //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); - refReadsToPopulate.add(read); - } - } - - return totalRawMismatchSum; - } - - private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, - final Set altConsensesToPopulate, - final byte[] reference, - final int leftmostIndex) { - - // if we are under the limit, use all reads to generate alternate consenses - if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { - for ( AlignedRead aRead : altAlignmentsToTest ) { - if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); - else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); - } - } - // otherwise, choose reads for alternate consenses randomly - else { - int readsSeen = 0; - while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { - int index = 
GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); - AlignedRead aRead = altAlignmentsToTest.remove(index); - if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); - else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); - } - } - } - - private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { - - // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); - if ( c != null ) - altConsensesToPopulate.add(c); - } - - private void createAndAddAlternateConsensus1(AlignedRead read, final Set altConsensesToPopulate, - final byte[] reference, final int leftmostIndex) { - - for ( Consensus known : altConsensesToPopulate ) { - Pair altAlignment = findBestOffset(known.str, read, leftmostIndex); - // the mismatch score is the min of its alignment vs. the reference and vs. 
the alternate - int myScore = altAlignment.second; - if ( myScore == 0 ) {exactMatchesFound++; return; }// read matches perfectly to a known alt consensus - no need to run SW, we already know the answer - } - // do a pairwise alignment against the reference - SWalignmentRuns++; - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); - if ( c != null ) { - altConsensesToPopulate.add(c); - SWalignmentSuccess++; - } - } - - // create a Consensus from cigar/read strings which originate somewhere on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { - if ( indexOnRef < 0 ) - return null; - - // if there are no indels, we do not need this consensus, can abort early: - if ( c.numCigarElements() == 1 && c.getCigarElement(0).getOperator() == CigarOperator.M ) return null; - - // create the new consensus - ArrayList elements = new ArrayList(c.numCigarElements()-1); - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < indexOnRef; i++) - sb.append((char)reference[i]); - - int indelCount = 0; - int altIdx = 0; - int refIdx = indexOnRef; - boolean ok_flag = true; - for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { - CigarElement ce = c.getCigarElement(i); - int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case D: - refIdx += elementLength; - indelCount++; - elements.add(ce); - break; - case M: - case EQ: - case X: - altIdx += elementLength; - case N: - if ( reference.length < refIdx + elementLength ) - ok_flag = false; - else { - for (int j = 0; j < elementLength; j++) - sb.append((char)reference[refIdx+j]); - } - refIdx += elementLength; - elements.add(new CigarElement(elementLength, CigarOperator.M)); - break; - case I: - for (int j = 0; j < elementLength; j++) { - if 
( ! BaseUtils.isRegularBase(readStr[altIdx+j]) ) { - // Insertions with N's in them cause real problems sometimes; it's better to drop them altogether - ok_flag=false; - break; - } - sb.append((char)readStr[altIdx + j]); - } - altIdx += elementLength; - indelCount++; - elements.add(ce); - break; - case S: - default: - break; - } - } - // make sure that there is at most only a single indel and it aligns appropriately! - if ( !ok_flag || indelCount != 1 || reference.length < refIdx ) - return null; - - for (int i = refIdx; i < reference.length; i++) - sb.append((char)reference[i]); - byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read - - return new Consensus(altConsensus, new Cigar(elements), indexOnRef); - } - - // create a Consensus from just the indel string that falls on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { - if ( indexOnRef < 0 || indexOnRef >= reference.length ) - return null; - - // create the new consensus - StringBuilder sb = new StringBuilder(); - Cigar cigar = new Cigar(); - int refIdx; - - for (refIdx = 0; refIdx < indexOnRef; refIdx++) - sb.append((char)reference[refIdx]); - if ( indexOnRef > 0 ) - cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - - if ( indel.isSimpleDeletion() ) { - refIdx += indelStr.length; - cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); - } - else if ( indel.isSimpleInsertion() ) { - for ( byte b : indelStr ) - sb.append((char)b); - cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); - } else { - throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); - } - - if ( reference.length - refIdx > 0 ) - cigar.add(new CigarElement(reference.length - refIdx, CigarOperator.M)); - for (; refIdx < reference.length; refIdx++) - 
sb.append((char)reference[refIdx]); - byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read - - return new Consensus(altConsensus, cigar, 0); - } - - private Pair findBestOffset(final byte[] ref, final AlignedRead read, final int leftmostIndex) { - - // optimization: try the most likely alignment first (to get a low score to beat) - int originalAlignment = read.getOriginalAlignmentStart() - leftmostIndex; - int bestScore = mismatchQualitySumIgnoreCigar(read, ref, originalAlignment, Integer.MAX_VALUE); - int bestIndex = originalAlignment; - - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - - // optimization: the correct alignment shouldn't be too far from the original one (or else the read wouldn't have aligned in the first place) - for ( int i = 0; i < originalAlignment; i++ ) { - int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); - if ( score < bestScore ) { - bestScore = score; - bestIndex = i; - } - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - } - - final int maxPossibleStart = ref.length - read.getReadLength(); - for ( int i = originalAlignment + 1; i <= maxPossibleStart; i++ ) { - int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore); - if ( score < bestScore ) { - bestScore = score; - bestIndex = i; - } - // optimization: we can't get better than 0, so we can quit now - if ( bestScore == 0 ) - return new Pair(bestIndex, 0); - } - - return new Pair(bestIndex, bestScore); - } - - - private boolean updateRead(final Cigar altCigar, final int altPosOnRef, final int myPosOnAlt, final AlignedRead aRead, final int leftmostIndex) { - Cigar readCigar = new Cigar(); - - // special case: there is no indel - if ( altCigar.getCigarElements().size() == 1 ) { - aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); - 
readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - aRead.setCigar(readCigar); - return true; - } - - CigarElement altCE1 = altCigar.getCigarElement(0); - CigarElement altCE2 = altCigar.getCigarElement(1); - - int leadingMatchingBlockLength = 0; // length of the leading M element or 0 if the leading element is I - - CigarElement indelCE; - if ( altCE1.getOperator() == CigarOperator.I ) { - indelCE=altCE1; - if ( altCE2.getOperator() != CigarOperator.M ) { - logger.warn("When the first element of the alt consensus is I, the second one must be M. Actual: " + altCigar.toString() + ". Skipping this site..."); - return false; - } - } - else { - if ( altCE1.getOperator() != CigarOperator.M ) { - logger.warn("First element of the alt consensus cigar must be M or I. Actual: " + altCigar.toString() + ". Skipping this site..."); - return false; - } - if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D ) { - indelCE=altCE2; - } else { - logger.warn("When first element of the alt consensus is M, the second one must be I or D. Actual: " + altCigar.toString() + ". 
Skipping this site..."); - return false; - } - leadingMatchingBlockLength = altCE1.getLength(); - } - - // the easiest thing to do is to take each case separately - int endOfFirstBlock = altPosOnRef + leadingMatchingBlockLength; - boolean sawAlignmentStart = false; - - // for reads starting before the indel - if ( myPosOnAlt < endOfFirstBlock) { - aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); - sawAlignmentStart = true; - - // for reads ending before the indel - if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) { - //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - //aRead.setCigar(readCigar); - aRead.setCigar(null); // reset to original alignment - return true; - } - readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); - } - - // forward along the indel - //int indelOffsetOnRef = 0, indelOffsetOnRead = 0; - if ( indelCE.getOperator() == CigarOperator.I ) { - // for reads that end in an insertion - if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + indelCE.getLength() ) { - int partialInsertionLength = myPosOnAlt + aRead.getReadLength() - endOfFirstBlock; - // if we also started inside the insertion, then we need to modify the length - if ( !sawAlignmentStart ) - partialInsertionLength = aRead.getReadLength(); - readCigar.add(new CigarElement(partialInsertionLength, CigarOperator.I)); - aRead.setCigar(readCigar); - return true; - } - - // for reads that start in an insertion - if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + indelCE.getLength() ) { - aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); - readCigar.add(new CigarElement(indelCE.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); - //indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; - sawAlignmentStart = true; - } else if ( sawAlignmentStart ) { - readCigar.add(indelCE); - //indelOffsetOnRead = indelCE.getLength(); - } - } else if ( indelCE.getOperator() == CigarOperator.D ) { - if ( 
sawAlignmentStart ) - readCigar.add(indelCE); - //indelOffsetOnRef = indelCE.getLength(); - } - - // for reads that start after the indel - if ( !sawAlignmentStart ) { - //aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead); - //readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); - //aRead.setCigar(readCigar); - aRead.setCigar(null); // reset to original alignment - return true; - } - - int readRemaining = aRead.getReadBases().length; - for ( CigarElement ce : readCigar.getCigarElements() ) { - if ( ce.getOperator() != CigarOperator.D ) - readRemaining -= ce.getLength(); - } - if ( readRemaining > 0 ) - readCigar.add(new CigarElement(readRemaining, CigarOperator.M)); - aRead.setCigar(readCigar); - - return true; - } - - private boolean alternateReducesEntropy(final List reads, final byte[] reference, final int leftmostIndex) { - final int[] originalMismatchBases = new int[reference.length]; - final int[] cleanedMismatchBases = new int[reference.length]; - final int[] totalOriginalBases = new int[reference.length]; - final int[] totalCleanedBases = new int[reference.length]; - - // set to 1 to prevent dividing by zero - for ( int i=0; i < reference.length; i++ ) - originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 0; - - for (final AlignedRead read : reads) { - if (read.getRead().getAlignmentBlocks().size() > 1) - continue; - - int refIdx = read.getOriginalAlignmentStart() - leftmostIndex; - final byte[] readStr = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - - for (int j = 0; j < readStr.length; j++, refIdx++) { - if (refIdx < 0 || refIdx >= reference.length) { - //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); - //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); - break; - } - 
totalOriginalBases[refIdx] += quals[j]; - if (readStr[j] != reference[refIdx]) - originalMismatchBases[refIdx] += quals[j]; - } - - // reset and now do the calculation based on the cleaning - refIdx = read.getAlignmentStart() - leftmostIndex; - int altIdx = 0; - Cigar c = read.getCigar(); - for (int j = 0; j < c.numCigarElements(); j++) { - CigarElement ce = c.getCigarElement(j); - int elementLength = ce.getLength(); - switch (ce.getOperator()) { - case M: - case EQ: - case X: - for (int k = 0; k < elementLength; k++, refIdx++, altIdx++) { - if (refIdx >= reference.length) - break; - totalCleanedBases[refIdx] += quals[altIdx]; - if (readStr[altIdx] != reference[refIdx]) - cleanedMismatchBases[refIdx] += quals[altIdx]; - } - break; - case I: - altIdx += elementLength; - break; - case D: - refIdx += elementLength; - break; - case S: - default: - break; - } - } - } - - int originalMismatchColumns = 0, cleanedMismatchColumns = 0; - StringBuilder sb = new StringBuilder(); - for ( int i=0; i < reference.length; i++ ) { - if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) - continue; - boolean didMismatch = false, stillMismatches = false; - if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { - didMismatch = true; - originalMismatchColumns++; - if ( totalCleanedBases[i] > 0 && ((double)cleanedMismatchBases[i] / (double)totalCleanedBases[i]) > ((double)originalMismatchBases[i] / (double)totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { - stillMismatches = true; - cleanedMismatchColumns++; - } - } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { - cleanedMismatchColumns++; - } - if ( snpsOutput != null ) { - if ( didMismatch ) { - sb.append(reads.get(0).getRead().getReferenceName()).append(":").append(leftmostIndex + i); - if ( stillMismatches ) - sb.append(" SAME_SNP\n"); - else - sb.append(" NOT_SNP\n"); - } - } - } - - //logger.debug("Original mismatch columns = " + 
originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); - - final boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); - if ( reduces && snpsOutput != null ) { - try { - snpsOutput.write(sb.toString()); - snpsOutput.flush(); - } catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile("snpsOutput", "Failed to write SNPs output file", e); - } - } - return reduces; - } - - protected static Cigar unclipCigar(Cigar cigar) { - ArrayList elements = new ArrayList(cigar.numCigarElements()); - for ( CigarElement ce : cigar.getCigarElements() ) { - if ( !isClipOperator(ce.getOperator()) ) - elements.add(ce); - } - return new Cigar(elements); - } - - private static boolean isClipOperator(CigarOperator op) { - return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P; - } - - protected static Cigar reclipCigar(Cigar cigar, SAMRecord read) { - ArrayList elements = new ArrayList(); - - int i = 0; - int n = read.getCigar().numCigarElements(); - while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - elements.add(read.getCigar().getCigarElement(i++)); - - elements.addAll(cigar.getCigarElements()); - - i++; - while ( i < n && !isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - i++; - - while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) ) - elements.add(read.getCigar().getCigarElement(i++)); - - return new Cigar(elements); - } - - private class AlignedRead { - private final GATKSAMRecord read; - private byte[] readBases = null; - private byte[] baseQuals = null; - private Cigar newCigar = null; - private int newStart = -1; - private int mismatchScoreToReference = 0; - private long alignerMismatchScore = 0; - - public AlignedRead(GATKSAMRecord read) { - this.read = read; - mismatchScoreToReference = 0; - } - - public GATKSAMRecord getRead() { - return read; - } - - public int getReadLength() { 
- return readBases != null ? readBases.length : read.getReadLength(); - } - - public byte[] getReadBases() { - if ( readBases == null ) - getUnclippedBases(); - return readBases; - } - - public byte[] getBaseQualities() { - if ( baseQuals == null ) - getUnclippedBases(); - return baseQuals; - } - - // pull out the bases that aren't clipped out - private void getUnclippedBases() { - readBases = new byte[getReadLength()]; - baseQuals = new byte[getReadLength()]; - byte[] actualReadBases = read.getReadBases(); - byte[] actualBaseQuals = read.getBaseQualities(); - int fromIndex = 0, toIndex = 0; - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - fromIndex += elementLength; - break; - case M: - case EQ: - case X: - case I: - System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength); - System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength); - fromIndex += elementLength; - toIndex += elementLength; - default: - break; - } - } - - // if we got clipped, trim the array - if ( fromIndex != toIndex ) { - byte[] trimmedRB = new byte[toIndex]; - byte[] trimmedBQ = new byte[toIndex]; - System.arraycopy(readBases, 0, trimmedRB, 0, toIndex); - System.arraycopy(baseQuals, 0, trimmedBQ, 0, toIndex); - readBases = trimmedRB; - baseQuals = trimmedBQ; - } - } - - public Cigar getCigar() { - return (newCigar != null ? newCigar : read.getCigar()); - } - - public void setCigar(Cigar cigar) { - setCigar(cigar, true); - } - - // tentatively sets the new Cigar, but it needs to be confirmed later - public void setCigar(Cigar cigar, boolean fixClippedCigar) { - if ( cigar == null ) { - newCigar = null; - return; - } - - if ( fixClippedCigar && getReadBases().length < read.getReadLength() ) - cigar = reclipCigar(cigar); - - // no change? - if ( read.getCigar().equals(cigar) ) { - newCigar = null; - return; - } - - // no indel? 
- String str = cigar.toString(); - if ( !str.contains("D") && !str.contains("I") ) { - logger.debug("Modifying a read with no associated indel; although this is possible, it is highly unlikely. Perhaps this region should be double-checked: " + read.getReadName() + " near " + read.getReferenceName() + ":" + read.getAlignmentStart()); - // newCigar = null; - // return; - } - - newCigar = cigar; - } - - // pull out the bases that aren't clipped out - private Cigar reclipCigar(Cigar cigar) { - return IndelRealigner.reclipCigar(cigar, read); - } - - // tentatively sets the new start, but it needs to be confirmed later - public void setAlignmentStart(int start) { - newStart = start; - } - - public int getAlignmentStart() { - return (newStart != -1 ? newStart : read.getAlignmentStart()); - } - - public int getOriginalAlignmentStart() { - return read.getAlignmentStart(); - } - - // finalizes the changes made. - // returns true if this record actually changes, false otherwise - public boolean finalizeUpdate() { - // if we haven't made any changes, don't do anything - if ( newCigar == null ) - return false; - if ( newStart == -1 ) - newStart = read.getAlignmentStart(); - else if ( Math.abs(newStart - read.getAlignmentStart()) > MAX_POS_MOVE_ALLOWED ) { - logger.debug(String.format("Attempting to realign read %s at %d more than %d bases to %d.", read.getReadName(), read.getAlignmentStart(), MAX_POS_MOVE_ALLOWED, newStart)); - return false; - } - - // annotate the record with the original cigar (and optionally the alignment start) - if ( !NO_ORIGINAL_ALIGNMENT_TAGS ) { - read.setAttribute(ORIGINAL_CIGAR_TAG, read.getCigar().toString()); - if ( newStart != read.getAlignmentStart() ) - read.setAttribute(ORIGINAL_POSITION_TAG, read.getAlignmentStart()); - } - - read.setCigar(newCigar); - read.setAlignmentStart(newStart); - - return true; - } - - public void setMismatchScoreToReference(int score) { - mismatchScoreToReference = score; - } - - public int 
getMismatchScoreToReference() { - return mismatchScoreToReference; - } - - public void setAlignerMismatchScore(long score) { - alignerMismatchScore = score; - } - - public long getAlignerMismatchScore() { - return alignerMismatchScore; - } - } - - private static class Consensus { - public final byte[] str; - public final ArrayList> readIndexes; - public final int positionOnReference; - public int mismatchSum; - public Cigar cigar; - - public Consensus(byte[] str, Cigar cigar, int positionOnReference) { - this.str = str; - this.cigar = cigar; - this.positionOnReference = positionOnReference; - mismatchSum = 0; - readIndexes = new ArrayList>(); - } - - @Override - public boolean equals(Object o) { - return ( this == o || (o instanceof Consensus && Arrays.equals(this.str,(((Consensus)o).str)) ) ); - } - - public boolean equals(Consensus c) { - return ( this == c || Arrays.equals(this.str,c.str) ) ; - } - - @Override - public int hashCode() { - return Arrays.hashCode(this.str); - } - } - -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java deleted file mode 100644 index 0b0fa020e..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.Map; - - -public class PairHMMIndelErrorModel { - public static final int BASE_QUAL_THRESHOLD = 20; - - private boolean DEBUG = false; - - private static final int 
MAX_CACHED_QUAL = 127; - - private static final double baseMatchArray[]; - private static final double baseMismatchArray[]; - - private static final int START_HRUN_GAP_IDX = 4; - private static final int MAX_HRUN_GAP_IDX = 20; - - private static final byte MIN_GAP_OPEN_PENALTY = 30; - private static final byte MIN_GAP_CONT_PENALTY = 10; - private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. - - private final byte[] GAP_OPEN_PROB_TABLE; - private final byte[] GAP_CONT_PROB_TABLE; - - private final PairHMM pairHMM; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - - static { - baseMatchArray = new double[MAX_CACHED_QUAL+1]; - baseMismatchArray = new double[MAX_CACHED_QUAL+1]; - for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = Math.pow(10, -k/10.); - - - baseMatchArray[k] = Math.log10(1-baseProb); - baseMismatchArray[k] = Math.log10(baseProb); - } - } - - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { - this.DEBUG = deb; - - switch (hmmType) { - case EXACT: - pairHMM = new Log10PairHMM(true); - break; - case ORIGINAL: - pairHMM = new Log10PairHMM(false); - break; - case LOGLESS_CACHING: - pairHMM = new LoglessPairHMM(); - break; - case ARRAY_LOGLESS: - pairHMM = new ArrayLoglessPairHMM(); - break; - default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); - } - - // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = indelGOP; - GAP_CONT_PROB_TABLE[i] = indelGCP; - } - - double step = GAP_PENALTY_HRUN_STEP/10.0; - - // initialize gop and gcp to their default values - byte gop = indelGOP; - byte gcp = indelGCP; - - // all of the following is computed in QUal-space - for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop -= GAP_PENALTY_HRUN_STEP; - if (gop < MIN_GAP_OPEN_PENALTY) - gop = MIN_GAP_OPEN_PENALTY; - - gcp -= step; - if(gcp < MIN_GAP_CONT_PENALTY) - gcp = MIN_GAP_CONT_PENALTY; - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; - } - - } - - static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { - // compute forward hrun length, example: - // AGGTGACCCCCCTGAGAG - // 001000012345000000 - hrunArray[0] = 0; - int[] hforward = new int[hrunArray.length]; - int[] hreverse = new int[hrunArray.length]; - - for (int i = 1; i < refBytes.length; i++) { - if (refBytes[i] == refBytes[i-1]) - hforward[i] = hforward[i-1]+1; - else - hforward[i] = 0; - } - - // do similar thing for reverse length, example: - // AGGTGACCCCCCTGAGAG - // 021000543210000000 - // and then accumulate with forward values. 
- // Total: - // AGGTGACCCCCCTGAGAG - // 022000555555000000 - for (int i=refBytes.length-1; i > 0; i--) { - if (refBytes[i-1] == refBytes[i]) - hreverse[i-1] += hreverse[i]+1; - } - - for (int i = 1; i < refBytes.length; i++) - hrunArray[i] = hforward[i]+hreverse[i]; - } - - - private void fillGapProbabilities(final int[] hrunProfile, - final byte[] contextLogGapOpenProbabilities, - final byte[] contextLogGapContinuationProbabilities) { - // fill based on lookup table - for (int i = 0; i < hrunProfile.length; i++) { - if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { - contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; - contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1]; - } - else { - contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]]; - contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]]; - } - } - } - - private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, - long startLocationInRefForHaplotypes, - long stopLocationInRefForHaplotypes, - final ReferenceContext ref){ - - final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); - for (final Allele a: haplotypeMap.keySet()) { - - final Haplotype haplotype = haplotypeMap.get(a); - - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) - startLocationInRefForHaplotypes = haplotype.getStopPosition(); - - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", - indStart, indStop, 
ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - - // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map - final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); - trimmedHaplotypeMap.put(a, trimmedHaplotype); - } - return trimmedHaplotypeMap; - } - - - public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, - final LinkedHashMap haplotypeMap, - final ReferenceContext ref, - final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final double downsamplingFraction) { - final int numHaplotypes = haplotypeMap.size(); - - final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); - return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); - - } - - /** - * Should we clip a downstream portion of a read because it spans off the end of a haplotype? - * - * @param read the read in question - * @param refWindowStop the end of the reference window - * @return true if the read needs to be clipped, false otherwise - */ - protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { - return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() - 1 > refWindowStop ); - } - - /** - * Should we clip a upstream portion of a read because it spans off the end of a haplotype? 
- * - * @param read the read in question - * @param refWindowStart the start of the reference window - * @return true if the read needs to be clipped, false otherwise - */ - protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) { - return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart ); - } - - @Ensures("result != null && result.length == pileup.getNumberOfElements()") - public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, - final LinkedHashMap haplotypeMap, - final ReferenceContext ref, - final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final int[] readCounts) { - final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - - final LinkedList readList = new LinkedList<>(); - final Map readGCPArrayMap = new LinkedHashMap<>(); - int readIdx=0; - for (PileupElement p: pileup) { - // > 1 when the read is a consensus read representing multiple independent observations - readCounts[readIdx] = p.getRepresentativeCount(); - - // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) - if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { - Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); - int j=0; - for (Allele a: haplotypeMap.keySet()) { - readLikelihoods[readIdx][j++] = el.get(a); - } - } - else { - // extra padding on candidate haplotypes to make sure reads are always strictly contained - // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly - // in case bases at edge of reads have lower quality. 
- final int trailingBases = 3; - final int refWindowStart = ref.getWindow().getStart() + trailingBases; - final int refWindowStop = ref.getWindow().getStop() - trailingBases; - - if (DEBUG) { - System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); - } - - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - - // if the read extends beyond the downstream (right) end of the reference window, clip it - if ( mustClipDownstream(read, refWindowStop) ) - read = ReadClipper.hardClipByReadCoordinates(read, refWindowStop - read.getSoftStart() + 1, read.getReadLength() - 1); - - // if the read extends beyond the upstream (left) end of the reference window, clip it - if ( mustClipUpstream(read, refWindowStart) ) - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart); - - if (read.isEmpty()) - continue; - - // hard-clip low quality ends - this may introduce extra H elements in CIGAR string - read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD ); - - if (read.isEmpty()) - continue; - - // get bases of candidate haplotypes that overlap with reads - final long readStart = read.getSoftStart(); - final long readEnd = read.getSoftEnd(); - - // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match, - // but they're actually consistent with the insertion! - // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning. - // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read. 
- final long eventStartPos = ref.getLocus().getStart(); - - // compute total number of clipped bases (soft or hard clipped) and only use them if necessary - final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength); - final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0; - final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ; - final byte [] unclippedReadBases = read.getReadBases(); - final byte [] unclippedReadQuals = read.getBaseQualities(); - - /** - * Compute genomic locations that candidate haplotypes will span. - * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, - * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. - * We will propose haplotypes that overlap the read with some padding. - * True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read) - * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
- * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to - * differentiate context between two haplotypes - */ - final int absEventLength = Math.abs(eventLength); - long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0); - long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength; - - if (DEBUG) - System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - - int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; - - if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { - startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; - } - else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { - startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; - } - - // candidate haplotype cannot go beyond reference context - if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { - stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context - } - - if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { - stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read legnth, but we want to compute likelihoods in the whole region that a read might overlap - } - - // ok, we now figured out the total number of clipped bases on both ends. 
- // Figure out where we want to place the haplotype to score read against - - if (DEBUG) - System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); - - // LinkedHashMap readEl = new LinkedHashMap(); - - /** - * Check if we'll end up with an empty read once all clipping is done - */ - if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { - int j=0; - for (Allele a: haplotypeMap.keySet()) { - perReadAlleleLikelihoodMap.add(p,a,0.0); - readLikelihoods[readIdx][j++] = 0.0; - } - } - else { - final int endOfCopy = unclippedReadBases.length - numEndSoftClippedBases; - final byte[] readBases = Arrays.copyOfRange(unclippedReadBases, numStartSoftClippedBases, endOfCopy); - final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals, numStartSoftClippedBases, endOfCopy); - - int j=0; - - final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; - final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; - - // get homopolymer length profile for current haplotype - final int[] hrunProfile = new int[readBases.length]; - getContextHomopolymerLength(readBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - // get the base insertion and deletion qualities to use - final byte[] baseInsertionQualities, baseDeletionQualities; - if ( read.hasBaseIndelQualities() ) { - baseInsertionQualities = Arrays.copyOfRange(read.getBaseInsertionQualities(), numStartSoftClippedBases, endOfCopy); - baseDeletionQualities = Arrays.copyOfRange(read.getBaseDeletionQualities(), numStartSoftClippedBases, endOfCopy); - } else { - baseInsertionQualities = 
contextLogGapOpenProbabilities; - baseDeletionQualities = contextLogGapOpenProbabilities; - } - - // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM - final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); - readList.add(processedRead); - - // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM - readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - - // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations - final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - - // Get the likelihoods for our clipped read against each of our trimmed haplotypes. - final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); - - // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array - for (Allele a: trimmedHaplotypeMap.keySet()){ - double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); - perReadAlleleLikelihoodMap.add(p, a, readLikelihood); - readLikelihoods[readIdx][j++] = readLikelihood; - } - // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually - readList.remove(processedRead); - - // The same is true for the read/GCP-array map - readGCPArrayMap.remove(processedRead); - } - } - readIdx++; - } - - if (DEBUG) { - System.out.println("\nLikelihood summary"); - for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { - System.out.format("Read Index: %d ",readIdx); - for (int i=0; i < readLikelihoods[readIdx].length; i++) - System.out.format("L%d: 
%f ",i,readLikelihoods[readIdx][i]); - System.out.println(); - } - - } - - return readLikelihoods; - } - - private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { - return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); - } - -// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { -// if (b1.length != b2.length) -// return 0; // sanity check -// -// for (int i=0; i < b1.length; i++ ){ -// if ( b1[i]!= b2[i] ) -// return i; -// } -// return b1.length; -// } - - private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { - final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - - // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix - for (int i=0; i < numHaplotypes; i++) { - for (int j=i; j < numHaplotypes; j++){ - // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] - // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) - //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. 
- if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) - continue; - final double li = readLikelihoods[readIdx][i]; - final double lj = readLikelihoods[readIdx][j]; - final int readCount = readCounts[readIdx]; - haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF); - } - } - } - - final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; - int k=0; - for (int j=0; j < numHaplotypes; j++) { - for (int i=0; i <= j; i++){ - genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; - } - } - - // renormalize so that max element is zero. - return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java deleted file mode 100644 index 688f05934..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Arrays; - -class Haplotype extends BaseArray implements Cloneable { - public Haplotype(byte[] bases) { - super(bases); - } - - private Haplotype(Byte[] bases) { - super(bases); - } - - public Haplotype(Haplotype other) { - super(other); - } - - public Haplotype(BaseArray baseArr) { - super(baseArr.bases); - - if (baseArr.getNonNullIndices().length != baseArr.bases.length) - throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!"); - } - - public void updateBase(int index, Byte base) { - if (base == null) { - throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!"); - } - super.updateBase(index, base); - } - - public Haplotype clone() { - try { - super.clone(); - } catch (CloneNotSupportedException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - } - return new Haplotype(this); - } - - // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. - - public Haplotype subHaplotype(int fromIndex, int toIndex) { - return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java deleted file mode 100644 index 2a31b5425..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ /dev/null @@ -1,989 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.*; - 
-import java.io.PrintStream; -import java.util.*; - -/** - * Computes the most likely genotype combination and phases trios and parent/child pairs - * - *

- * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases - * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. - * Ambiguous sites are: - *

    - *
  • Sites where all individuals are heterozygous
  • - *
  • Sites where there is a Mendelian violation
  • - *
- * Missing genotypes are handled as follows: - *
    - *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • - *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • - *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.
  • - *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • - *
- * - *

Input

- *

- *

    - *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • - *
  • A PED pedigree file containing the description of the individuals relationships.
  • - *
- *

- * - *

Options

- *

- *

    - *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype - * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, - * and each individual genotype, depth, allelic depth and likelihoods.
  • - *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • - *
- *

- * - *

Output

- *

- * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T PhaseByTransmission \
- *   -V input.vcf \
- *   -ped input.ped \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -public class PhaseByTransmission extends RodWalker, HashMap> { - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") - private PrintStream mvFile = null; - - @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") - private double deNovoPrior=1e-8; - - @Argument(shortName = "fatherAlleleFirst",required = false,fullName = "FatherAlleleFirst", doc="Ouputs the father allele as the first allele in phased child genotype. i.e. father|mother rather than mother|father.") - private boolean fatherFAlleleFirst=false; - - @Output - protected VariantContextWriter vcfWriter = null; - - private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; - private final String SOURCE_NAME = "PhaseByTransmission"; - - public final double NO_TRANSMISSION_PROB = -1.0; - - private ArrayList trios = new ArrayList(); - - //Matrix of priors for all genotype combinations - private EnumMap>> mvCountMatrix; - - //Matrix of allele transmission - private EnumMap>> transmissionMatrix; - - //Metrics counters hash keys - private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; - private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; - private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; - private final Byte NUM_TRIO_HET_HET_HET = 3; - private final Byte NUM_TRIO_VIOLATIONS = 4; - private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; - private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; - private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; - private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; - private final Byte NUM_PAIR_HET_HET = 8; - private final Byte NUM_PAIR_VIOLATIONS = 9; - private final Byte 
NUM_GENOTYPES_MODIFIED = 11; - - //Random number generator - private Random rand = new Random(); - - private enum FamilyMember { - MOTHER, - FATHER, - CHILD - } - - //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. - //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. - private class TrioPhase { - - //Create 2 fake alleles - //The actual bases will never be used but the Genotypes created using the alleles will be. - private final Allele REF = Allele.create("A",true); - private final Allele VAR = Allele.create("A",false); - private final Allele NO_CALL = Allele.create(".",false); - private final String DUMMY_NAME = "DummySample"; - - private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); - - private ArrayList getAlleles(GenotypeType genotype){ - ArrayList alleles = new ArrayList(2); - if(genotype == GenotypeType.HOM_REF){ - alleles.add(REF); - alleles.add(REF); - } - else if(genotype == GenotypeType.HET){ - alleles.add(REF); - alleles.add(VAR); - } - else if(genotype == GenotypeType.HOM_VAR){ - alleles.add(VAR); - alleles.add(VAR); - } - else{ - return null; - } - return alleles; - } - - private boolean isPhasable(GenotypeType genotype){ - return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR; - } - - //Create a new Genotype based on information from a single individual - //Homozygous genotypes will be set as phased, heterozygous won't be - private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){ - boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR; - trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase)); - } - - private Genotype makeGenotype(final GenotypeType type, boolean phase) { - return makeGenotype(getAlleles(type), phase); - } - - private Genotype makeGenotype(final List alleles, boolean phase) { - final 
GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles); - gb.phased(phase); - return gb.make(); - } - - //Find the phase for a parent/child pair - private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){ - - //Special case for Het/Het as it is ambiguous - if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){ - trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); - return; - } - - ArrayList parentAlleles = getAlleles(parentGenotype); - ArrayList childAlleles = getAlleles(childGenotype); - ArrayList parentPhasedAlleles = new ArrayList(2); - ArrayList childPhasedAlleles = new ArrayList(2); - - //If there is a possible phasing between the parent and child => phase - int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); - if(childTransmittedAlleleIndex > -1){ - trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - if(parent.equals(FamilyMember.MOTHER)) - childPhasedAlleles.add(childAlleles.get(0)); - else - childPhasedAlleles.add(0,childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); - } - else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - parentPhasedAlleles.add(parentAlleles.get(1)); - parentPhasedAlleles.add(parentAlleles.get(0)); - trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - if(parent.equals(FamilyMember.MOTHER)) - childPhasedAlleles.add(childAlleles.get(0)); - else - childPhasedAlleles.add(0,childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true)); - } - //This is a Mendelian Violation => Do not phase - else{ - 
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false)); - } - } - - //Phases a family by transmission - private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){ - - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); - - //Build all possible child genotypes for the given parent's genotypes - for (Allele momAllele : motherAlleles) { - for (Allele fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(fatherAllele); - possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); - } - } - - for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { - int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); - int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); - //If a possible combination has been found, create the genotypes - if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Create mother's genotype - ArrayList motherPhasedAlleles = new ArrayList(2); - motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); - if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) - motherPhasedAlleles.add(motherAlleles.get(0)); - else - motherPhasedAlleles.add(motherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true)); - - //Create father's genotype - ArrayList fatherPhasedAlleles = new ArrayList(2); - fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); - if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) - fatherPhasedAlleles.add(fatherAlleles.get(0)); - else - fatherPhasedAlleles.add(fatherAlleles.get(1)); - 
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true)); - - //Create child's genotype - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true)); - - //Once a phased combination is found; exit - return; - } - } - - //If this is reached then no phasing could be found - trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false)); - trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false)); - } - - /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. - If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair - or single individual. - */ - public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){ - - //Take care of cases where one or more family members are no call - if(!isPhasable(child)){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else if(!isPhasable(mother)){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(!isPhasable(father)){ - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else - phasePairAlleles(father, child, FamilyMember.FATHER); - } - else if(!isPhasable(father)){ - phasePairAlleles(mother, child, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - } - //Special case for Het/Het/Het as it is ambiguous - else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - //All family members have genotypes and at least one of them 
is not Het - else{ - phaseFamilyAlleles(mother, father, child); - } - - //If child should phased genotype should be father first, then swap the alleles - if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){ - ArrayList childAlleles = new ArrayList(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles()); - childAlleles.add(childAlleles.remove(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true)); - } - - } - - /** - * Applies the trio genotype combination to the given trio. - * @param ref: Reference allele - * @param alt: Alternate allele - * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination - * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination - * @param childGenotype: Genotype of the child to phase using this trio genotype combination - * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) - * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child - */ - public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - } - - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ - - int phredScoreTransmission = -1; - if(transmissionProb != NO_TRANSMISSION_PROB){ - double 
dphredScoreTransmission = QualityUtils.phredScaleLog10ErrorRate(Math.log10(1 - (transmissionProb))); - phredScoreTransmission = dphredScoreTransmission < Byte.MAX_VALUE ? (byte)dphredScoreTransmission : Byte.MAX_VALUE; - } - //Handle null, missing and unavailable genotypes - //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable - //genotype so it is safe to return the original genotype in this case. - //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. - if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) - return genotype; - - //Add the transmission probability - Map genotypeAttributes = new HashMap(); - genotypeAttributes.putAll(genotype.getExtendedAttributes()); - if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); - - ArrayList phasedAlleles = new ArrayList(2); - for(Allele allele : phasedGenotype.getAlleles()){ - if(allele.isReference()) - phasedAlleles.add(refAllele); - else if(allele.isNonReference()) - phasedAlleles.add(altAllele); - //At this point there should not be any other alleles left - else - throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); - - } - - //Compute the new Log10Error if the genotype is different from the original genotype - double log10Error; - if(genotype.getType() == phasedGenotype.getType()) - log10Error = genotype.getLog10PError(); - else - log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); - - return new GenotypeBuilder(genotype).alleles(phasedAlleles) - .log10PError(log10Error) - .attributes(genotypeAttributes) - .phased(phasedGenotype.isPhased()).make(); - } - - - } - - /** - * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer - */ - public void initialize() { - ArrayList rodNames = new ArrayList(); - rodNames.add(variantCollection.variants.getName()); - Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - - //Get the trios from the families passed as ped - setTrios(); - if(trios.size()<1) - throw new UserException.BadInput("No PED file passed or no trios found in PED file. 
Aborted."); - - - Set headerLines = new HashSet(); - headerLines.addAll(GATKVCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); - headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); - vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); - - buildMatrices(); - - if(mvFile != null) - mvFile.println("CHROM\tPOS\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_AD\tMOTHER_PL\tFATHER_GT\tFATHER_DP\tFATHER_AD\tFATHER_PL\tCHILD_GT\tCHILD_DP\tCHILD_AD\tCHILD_PL"); - - } - - /** - * Select trios and parent/child pairs only - */ - private void setTrios(){ - - Map> families = this.getSampleDB().getFamilies(); - Set family; - ArrayList parents; - for(Map.Entry> familyEntry : families.entrySet()){ - family = familyEntry.getValue(); - if(family.size()<2 || family.size()>3){ - logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size())); - } - else{ - for(Sample familyMember : family){ - parents = familyMember.getParents(); - if(parents.size()>0){ - if(family.containsAll(parents)) - this.trios.add(familyMember); - else - logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. 
Family skipped.",familyEntry.getKey())); - break; - } - } - } - - } - - - - } - - //Create the transmission matrices - private void buildMatrices(){ - mvCountMatrix = new EnumMap>>(GenotypeType.class); - transmissionMatrix = new EnumMap>>(GenotypeType.class); - for(GenotypeType mother : GenotypeType.values()){ - mvCountMatrix.put(mother,new EnumMap>(GenotypeType.class)); - transmissionMatrix.put(mother,new EnumMap>(GenotypeType.class)); - for(GenotypeType father : GenotypeType.values()){ - mvCountMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); - transmissionMatrix.get(mother).put(father,new EnumMap(GenotypeType.class)); - for(GenotypeType child : GenotypeType.values()){ - mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); - transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); - } - } - } - } - - //Returns the number of Mendelian Violations for a given genotype combination. - //If one of the parents genotype is missing, it will consider it as a parent/child pair - //If the child genotype or both parents genotypes are missing, 0 is returned. 
- private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){ - - //Child is no call => No MV - if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE) - return 0; - //Add parents with genotypes for the evaluation - ArrayList parents = new ArrayList(); - if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE)) - parents.add(mother); - if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE)) - parents.add(father); - - //Both parents no calls => No MV - if (parents.isEmpty()) - return 0; - - //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed - int parentsNumRefAlleles = 0; - int parentsNumAltAlleles = 0; - - for(GenotypeType parent : parents){ - if(parent == GenotypeType.HOM_REF){ - parentsNumRefAlleles++; - } - else if(parent == GenotypeType.HET){ - parentsNumRefAlleles++; - parentsNumAltAlleles++; - } - else if(parent == GenotypeType.HOM_VAR){ - parentsNumAltAlleles++; - } - } - - //Case Child is HomRef - if(child == GenotypeType.HOM_REF){ - if(parentsNumRefAlleles == parents.size()) - return 0; - else return (parents.size()-parentsNumRefAlleles); - } - - //Case child is HomVar - if(child == GenotypeType.HOM_VAR){ - if(parentsNumAltAlleles == parents.size()) - return 0; - else return parents.size()-parentsNumAltAlleles; - } - - //Case child is Het - if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) - return 0; - - //MV - return 1; - } - - //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. 
- private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){ - int count = 0; - if(motherOriginal!=motherNew) - count++; - if(fatherOriginal!=fatherNew) - count++; - if(childOriginal!=childNew) - count++; - return count; - } - - //Get a Map of genotype likelihoods. - //In case of null, unavailable or no call, all likelihoods are 1/3. - private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ - EnumMap likelihoods = new EnumMap(GenotypeType.class); - likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); - likelihoods.put(GenotypeType.HET,1.0/3.0); - likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0); - return likelihoods; - } - return genotype.getLikelihoods().getAsMap(true); - } - - //Returns the GenotypeType; returns UNVAILABLE if given null - private GenotypeType getTypeSafeNull(Genotype genotype){ - if(genotype == null) - return GenotypeType.UNAVAILABLE; - return genotype.getType(); - } - - - /** - * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. 
- * @param ref: Reference allele - * @param alt: Alternative allele - * @param mother: Mother's genotype - * @param father: Father's genotype - * @param child: Child's genotype - * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child - * @return - */ - private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - - //Check whether it is a pair or trio - //Always assign the first parent as the parent having genotype information in pairs - //Always assign the mother as the first parent in trios - int parentsCalled = 0; - Map firstParentLikelihoods; - Map secondParentLikelihoods; - ArrayList bestFirstParentGenotype = new ArrayList(); - ArrayList bestSecondParentGenotype = new ArrayList(); - ArrayList bestChildGenotype = new ArrayList(); - GenotypeType pairSecondParentGenotype = null; - if(mother == null || !mother.isCalled()){ - firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); - secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); - bestFirstParentGenotype.add(getTypeSafeNull(father)); - bestSecondParentGenotype.add(getTypeSafeNull(mother)); - pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType(); - if(father != null && father.isCalled()) - parentsCalled = 1; - } - else{ - firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); - secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); - bestFirstParentGenotype.add(getTypeSafeNull(mother)); - bestSecondParentGenotype.add(getTypeSafeNull(father)); - if(father == null || !father.isCalled()){ - parentsCalled = 1; - pairSecondParentGenotype = father == null ? 
GenotypeType.UNAVAILABLE : father.getType(); - }else{ - parentsCalled = 2; - } - } - Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); - bestChildGenotype.add(getTypeSafeNull(child)); - - //Prior vars - double bestConfigurationLikelihood = 0.0; - double norm = 0.0; - int configuration_index =0; - ArrayList bestMVCount = new ArrayList(); - bestMVCount.add(0); - - //Get the most likely combination - //Only check for most likely combination if at least a parent and the child have genotypes - if(child.isCalled() && parentsCalled > 0){ - int mvCount; - int cumulativeMVCount = 0; - double configurationLikelihood = 0; - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ - for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); - //For parent/child pairs, sum over the possible genotype configurations of the missing parent - if(parentsCalled<2){ - cumulativeMVCount += mvCount; - configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); - } - //Evaluate configurations of trios - else{ - configurationLikelihood = mvCount>0 ? 
Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); - norm += configurationLikelihood; - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if (configurationLikelihood > bestConfigurationLikelihood){ - bestConfigurationLikelihood = configurationLikelihood; - bestMVCount.clear(); - bestMVCount.add(mvCount); - bestFirstParentGenotype.clear(); - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.clear(); - bestSecondParentGenotype.add(secondParentGenotype.getKey()); - bestChildGenotype.clear(); - bestChildGenotype.add(childGenotype.getKey()); - } - else if(configurationLikelihood == bestConfigurationLikelihood) { - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.add(secondParentGenotype.getKey()); - bestChildGenotype.add(childGenotype.getKey()); - bestMVCount.add(mvCount); - } - } - } - //Evaluate configurations of parent/child pairs - if(parentsCalled<2){ - norm += configurationLikelihood; - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if (configurationLikelihood > bestConfigurationLikelihood){ - bestConfigurationLikelihood = configurationLikelihood; - bestMVCount.clear(); - bestMVCount.add(cumulativeMVCount/3); - bestChildGenotype.clear(); - bestFirstParentGenotype.clear(); - bestSecondParentGenotype.clear(); - bestChildGenotype.add(childGenotype.getKey()); - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - bestSecondParentGenotype.add(pairSecondParentGenotype); - } - else if(configurationLikelihood == bestConfigurationLikelihood) { - bestFirstParentGenotype.add(firstParentGenotype.getKey()); - 
bestSecondParentGenotype.add(pairSecondParentGenotype); - bestChildGenotype.add(childGenotype.getKey()); - bestMVCount.add(cumulativeMVCount/3); - } - configurationLikelihood = 0; - } - } - } - - //normalize the best configuration probability - bestConfigurationLikelihood = bestConfigurationLikelihood / norm; - - //In case of multiple equally likely combinations, take a random one - if(bestFirstParentGenotype.size()>1){ - configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); - } - - } - else{ - bestConfigurationLikelihood = NO_TRANSMISSION_PROB; - } - - TrioPhase phasedTrioGenotypes; - if(parentsCalled < 2 && mother == null || !mother.isCalled()) - phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - else - phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - - //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return bestMVCount.get(configuration_index); - - } - - - private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ - - //Increment metrics counters - if(parent.isCalled() && child.isCalled()){ - counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); - if(parent.isPhased()) - counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); - else{ - counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); - if(parent.isHet() && child.isHet()) - counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); - } - }else{ - counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); - } - - } - - private void 
updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ - - //Increment metrics counters - if(mother.isCalled() && father.isCalled() && child.isCalled()){ - counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); - if(mother.isPhased()) - counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); - - else{ - if(mvCount > 0){ - if(mvCount >1) - counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); - else - counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); - } - else if(mother.isHet() && father.isHet() && child.isHet()) - counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - - } - }else{ - counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); - } - } - - /** - * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype - * - * @param tracker the reference meta-data tracker - * @param ref the reference context - * @param context the alignment context - * @return null - */ - @Override - public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - HashMap metricsCounters = new HashMap(10); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_TRIO_HET_HET_HET,0); - metricsCounters.put(NUM_TRIO_VIOLATIONS,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_PAIR_HET_HET,0); - metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); - metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); - - String mvfLine; - - if (tracker == null) - return metricsCounters; - - final 
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); - if ( vc == null ) - return metricsCounters; - - if ( !vc.isBiallelic() ) { - vcfWriter.add(vc); - return metricsCounters; - } - - final VariantContextBuilder builder = new VariantContextBuilder(vc); - - final GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); - for (Sample sample : trios) { - Genotype mother = vc.getGenotype(sample.getMaternalID()); - Genotype father = vc.getGenotype(sample.getPaternalID()); - Genotype child = vc.getGenotype(sample.getID()); - - //Keep only trios and parent/child pairs - if(mother == null && father == null || child == null) - continue; - - ArrayList trioGenotypes = new ArrayList(3); - final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); - - Genotype phasedMother = trioGenotypes.get(0); - Genotype phasedFather = trioGenotypes.get(1); - Genotype phasedChild = trioGenotypes.get(2); - - //Fill the genotype map with the new genotypes and increment metrics counters - genotypesContext.replace(phasedChild); - if(mother != null){ - genotypesContext.replace(phasedMother); - if(father != null){ - genotypesContext.replace(phasedFather); - updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()), - phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), - 
phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - } - else{ - updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); - if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - } - } - else{ - genotypesContext.replace(phasedFather); - updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); - if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) - metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); - mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", - vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(), - phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString()); - } - - //Report violation if set so - //TODO: ADAPT 
FOR PAIRS TOO!! - if(mvCount>0 && mvFile != null && !vc.isFiltered()) - mvFile.println(mvfLine); - } - - builder.genotypes(genotypesContext); - vcfWriter.add(builder.make()); - - return metricsCounters; - } - - private static String printAD(final int[] AD) { - if ( AD == null || AD.length == 0 ) - return "."; - final StringBuilder sb = new StringBuilder(); - sb.append(AD[0]); - for ( int i = 1; i < AD.length; i++) { - sb.append(","); - sb.append(AD[i]); - } - return sb.toString(); - } - - /** - * Initializes the reporting counters. - * - * @return All counters initialized to 0 - */ - @Override - public HashMap reduceInit() { - HashMap metricsCounters = new HashMap(10); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_TRIO_HET_HET_HET,0); - metricsCounters.put(NUM_TRIO_VIOLATIONS,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); - metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_PAIR_HET_HET,0); - metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); - metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); - - return metricsCounters; - } - - /** - * Adds the value of the site phased to the reporting counters. - * - * @param value Site values - * @param sum accumulator for the reporting counters - * @return accumulator with result of the map taken into account. 
- */ - @Override - public HashMap reduce(HashMap value, HashMap sum) { - sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); - sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); - sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); - sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); - sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); - sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); - sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); - sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); - sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); - sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); - sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); - sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); - - return sum; - } - - - /** - * Reports statistics on the phasing by transmission process. - * @param result Accumulator with all counters. 
- */ - @Override - public void onTraversalDone(HashMap result) { - logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); - logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); - logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); - logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); - logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); - logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); - logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); - logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); - logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); - logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); - logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); - logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); - - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java deleted file mode 100644 index a297b38cf..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ /dev/null @@ -1,1781 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import 
org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.io.*; -import java.util.*; - -import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). - * - *

- * Performs physical phasing of SNP calls, based on sequencing reads. - *

- * - *

Input

- *

- * VCF file of SNP calls, BAM file of sequence reads. - *

- * - *

Output

- *

- * Phased VCF file. - *

- * - *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T ReadBackedPhasing
- *      -R reference.fasta
- *      -I reads.bam
- *      --variant SNPs.vcf
- *      -L SNPs.vcf
- *      -o phased_SNPs.vcf
- *      --phaseQualityThresh 20.0
- * 
- * - * @author Menachem Fromer - * @since July 2010 - */ -@Allows(value = {DataSource.READS, DataSource.REFERENCE}) -@Requires(value = {DataSource.READS, DataSource.REFERENCE}) -@By(DataSource.READS) - -// Filter out all reads with zero mapping quality -@ReadFilters({MappingQualityZeroFilter.class}) - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -public class ReadBackedPhasing extends RodWalker { - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) - protected boolean DEBUG = false; - /** - * The VCF file we are phasing variants from. - * - * All heterozygous variants found in this VCF file will be phased, where possible - */ - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Output(doc = "File to which variants should be written") - protected VariantContextWriter writer = null; - - @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false) - protected Integer cacheWindow = 20000; - - @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false) - protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes - - @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false) - protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 - - @Hidden - @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING 
purposes only - DO NOT USE!]", required = false) - protected String variantStatsFilePrefix = null; - private PhasingQualityStatsWriter statsWriter = null; - - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false) - public int MIN_BASE_QUALITY_SCORE = 17; - - @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false) - public int MIN_MAPPING_QUALITY_SCORE = 20; - - @Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false) - protected Set samplesToPhase = null; - - @Hidden - @Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false) - private boolean permitNoSampleOverlap = false; - - /** - * Important note: do not use this argument if your input data set is not already phased or it will cause the tool to skip over all heterozygous sites. 
- */ - @Argument(fullName = "respectPhaseInInput", shortName = "respectPhaseInInput", doc = "Will only phase genotypes in cases where the resulting output will necessarily be consistent with any existing phase (for example, from trios)", required = false) - private boolean respectPhaseInInput = false; - - private GenomeLoc mostDownstreamLocusReached = null; - - private LinkedList unphasedSiteQueue = null; - private CloneableIteratorLinkedList partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions - - private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); - - public static final String PQ_KEY = "PQ"; - - // In order to detect phase inconsistencies: - private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads - private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site - - public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent"; - - @Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false) - protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false; - - @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false) - protected int maxGenomicDistanceForMNP = 1; - - @Hidden - @Argument(fullName = "outputMultipleBaseCountsFile", shortName = "outputMultipleBaseCountsFile", doc = "File to output cases where a single read has multiple bases at the same position [For DEBUGGING purposes only - DO NOT USE!]", required = false) - protected File 
outputMultipleBaseCountsFile = null; - private MultipleBaseCountsWriter outputMultipleBaseCountsWriter = null; - - public void initialize() { - if (maxPhaseSites <= 2) - maxPhaseSites = 2; // by definition, must phase a site relative to previous site [thus, 2 in total] - - /* - Since we cap each base quality (BQ) by its read's mapping quality (MQ) [in Read.updateBaseAndQuality()], then: - if minBQ > minMQ, then we require that MQ be >= minBQ as well. - [Otherwise, we end up capping BQ by MQ only AFTER we tried removing bases with BQ < minBQ, which is WRONG!] - - To do this properly, we set: minMQ = max(minMQ, minBQ) - */ - MIN_MAPPING_QUALITY_SCORE = Math.max(MIN_MAPPING_QUALITY_SCORE, MIN_BASE_QUALITY_SCORE); - - unphasedSiteQueue = new LinkedList(); - partiallyPhasedSites = new CloneableIteratorLinkedList(); - - initializeVcfWriter(); - - if (variantStatsFilePrefix != null) - statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix); - - if (outputMultipleBaseCountsFile != null) - outputMultipleBaseCountsWriter = new MultipleBaseCountsWriter(outputMultipleBaseCountsFile); - } - - private void initializeVcfWriter() { - // Wrapper VCFWriters will take ownership of inner writers iff: inner writer != origWriter [which wasn't created here] - VariantContextWriter origWriter = writer; - - if (enableMergePhasedSegregatingPolymorphismsToMNP) - writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter); - - /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow - Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow - Hence, the startDistance between: partiallyPhasedSites.peek() --> mostDownstreamLocusReached is <= 2 * cacheWindow - - Therefore, can write the filtered 
records located at mostDownstreamLocusReached (if any) to SortingVCFWriter, even though partiallyPhasedSites.peek() has not yet been written. - - But, NOTE that map() is careful to pass out a list of records to be written that FIRST includes any records discarded due to having reached mostDownstreamLocusReached, - and only THEN records located at mostDownstreamLocusReached. The opposite order in map() would violate the startDistance limits imposed when contracting SortingVCFWriter with (2 * cacheWindow). - */ - writer = VariantContextWriterFactory.sortOnTheFly(writer, 2 * cacheWindow, writer != origWriter); - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - // Phasing-specific INFO fields: - hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); - hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?")); - - // todo -- fix samplesToPhase - String trackName = variantCollection.variants.getName(); - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); - writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); - - Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - readSamples.retainAll(vcfSamples); - if (readSamples.isEmpty()) { - String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? 
"" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; - if (permitNoSampleOverlap) - logger.warn(noPhaseString); - else - throw new UserException(noPhaseString); - } - } - - public PhasingStats reduceInit() { - return new PhasingStats(); - } - - /** - * For each site of interest, cache the current site and then use the cache to phase all sites - * for which "sufficient" information has already been observed. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. - */ - public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - mostDownstreamLocusReached = ref.getLocus(); - if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached); - - PhasingStats phaseStats = new PhasingStats(); - List unprocessedList = new LinkedList(); - - for (VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { - if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase); - - if (ReadBackedPhasing.processVariantInPhasing(vc)) { - VariantAndReads vr = new VariantAndReads(vc, context); - unphasedSiteQueue.add(vr); - - if (DEBUG) - logger.debug("Added variant to queue = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); - } - else { - unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly - - if (DEBUG) - logger.debug("Unprocessed variant = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - } - - int numReads = context.getBasePileup().getNumberOfElements(); - PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); - phaseStats.addIn(addInPhaseStats); - } - - List completedList = 
processQueue(phaseStats, false); - completedList.addAll(unprocessedList); // add unprocessedList on to the END of completedList so that the processQueue() results, which are necessarily more upstream, are first! - - return new PhasingStatsAndOutput(phaseStats, completedList); - } - - private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); - - private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { -// for ( String sample : samplesToPhase ) -// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromSamples(samplesToPhase); -// logger.debug("original VC = " + vc); -// logger.debug("sub VC = " + subvc); - return GATKVariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); - } - - private List processQueue(PhasingStats phaseStats, boolean processAll) { - List oldPhasedList = new LinkedList(); - - while (!unphasedSiteQueue.isEmpty()) { - if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue - VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; - if (startDistancesAreInWindowRange(mostDownstreamLocusReached, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) { - /* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc - (note that we ASSUME that the VCF is ordered by ). - Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. 
- */ - break; - } - // Already saw all variant positions within cacheWindow startDistance ahead of vc (on its contig) - } - // Update partiallyPhasedSites before it's used in phaseSite: - oldPhasedList.addAll(discardIrrelevantPhasedSites()); - if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); - - VariantAndReads vr = unphasedSiteQueue.remove(); - if (DEBUG) - logger.debug("Performing phasing for " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant)); - phaseSite(vr, phaseStats); - } - - // Update partiallyPhasedSites after phaseSite is done: - oldPhasedList.addAll(discardIrrelevantPhasedSites()); - if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); - - if (outputMultipleBaseCountsWriter != null) - outputMultipleBaseCountsWriter.outputMultipleBaseCounts(); - - return oldPhasedList; - } - - private List discardIrrelevantPhasedSites() { - List vcList = new LinkedList(); - - GenomeLoc nextToPhaseLoc = null; - if (!unphasedSiteQueue.isEmpty()) - nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); - - while (!partiallyPhasedSites.isEmpty()) { - if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" - UnfinishedVariantAndReads partPhasedVr = partiallyPhasedSites.peek(); - - if (startDistancesAreInWindowRange(partPhasedVr.unfinishedVariant.getLocation(), nextToPhaseLoc)) - // nextToPhaseLoc is still not far enough ahead of partPhasedVr to exclude partPhasedVr from calculations - break; - } - UnfinishedVariantAndReads uvr = partiallyPhasedSites.remove(); - vcList.add(uvr.unfinishedVariant.toVariantContext()); - } - - return vcList; - } - - /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in - partiallyPhasedSites, and all in unphasedSiteQueue that are within cacheWindow startDistance ahead of vc (on its contig). 
- - ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). - */ - - private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { - VariantContext vc = vr.variant; - logger.debug("Will phase vc = " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - - UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr); - UnfinishedVariantContext uvc = uvr.unfinishedVariant; - - // Perform per-sample phasing: - GenotypesContext sampGenotypes = vc.getGenotypes(); - Map samplePhaseStats = new TreeMap(); - for (final Genotype gt : sampGenotypes) { - String samp = gt.getSampleName(); - - if (DEBUG) logger.debug("sample = " + samp); - if (isUnfilteredCalledDiploidGenotype(gt)) { - if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: - // true <-> can trivially phase a hom site relative to ANY previous site: - Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make(); - uvc.setGenotype(samp, phasedGt); - } - else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype - PhasingWindow phaseWindow = new PhasingWindow(vr, samp); - if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against - SNPallelePair allelePair = new SNPallelePair(gt); - if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair); - - CloneableIteratorLinkedList.CloneableIterator prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; - /* Notes: - 1. Call to next() advances iterator to next position in partiallyPhasedSites. - 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads. 
- */ - UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype prevHetGenotype = prevUvc.getGenotype(samp); - - PhaseResult pr = phaseSampleAtSite(phaseWindow); - boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality); - - if (pr.phasingContainsInconsistencies) { - if (DEBUG) - logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - uvc.setPhasingInconsistent(); - } - - if (genotypesArePhased) { - SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype); - - if (DEBUG) - logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n"); - if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n"); - - ensurePhasing(allelePair, prevAllelePair, pr.haplotype); - Genotype phasedGt = new GenotypeBuilder(gt) - .alleles(allelePair.getAllelesAsList()) - .attribute(PQ_KEY, pr.phaseQuality) - .phased(genotypesArePhased).make(); - uvc.setGenotype(samp, phasedGt); - } - - // Now, update the 0 or more "interior" hom sites in between the previous het site and this het site: - while (prevHetAndInteriorIt.hasNext()) { - UnfinishedVariantContext interiorUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype handledGt = interiorUvc.getGenotype(samp); - if (handledGt == null || !isUnfilteredCalledDiploidGenotype(handledGt)) - throw new ReviewedStingException("LOGICAL error: should not have breaks WITHIN haplotype"); - if (!handledGt.isHom()) - throw new ReviewedStingException("LOGICAL error: should not have anything besides hom sites IN BETWEEN two het sites"); - - // Use the same phasing consistency and PQ for each hom site in the "interior" as for the het-het phase: - if (pr.phasingContainsInconsistencies) - interiorUvc.setPhasingInconsistent(); - - if (genotypesArePhased) { - Genotype phasedHomGt = new GenotypeBuilder(handledGt) - 
.attribute(PQ_KEY, pr.phaseQuality) - .phased(genotypesArePhased).make(); - interiorUvc.setGenotype(samp, phasedHomGt); - } - } - - if (statsWriter != null) - statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); - - PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); - if (sampPhaseCounts == null) { - sampPhaseCounts = new PhaseCounts(); - samplePhaseStats.put(samp, sampPhaseCounts); - } - sampPhaseCounts.numTestedSites++; - - if (pr.phasingContainsInconsistencies) { - if (genotypesArePhased) - sampPhaseCounts.numInconsistentSitesPhased++; - else - sampPhaseCounts.numInconsistentSitesNotPhased++; - } - - if (genotypesArePhased) - sampPhaseCounts.numPhased++; - } - } - } - } - - partiallyPhasedSites.add(uvr); // only add it in now, since don't want it to be there during phasing - phaseStats.addIn(new PhasingStats(samplePhaseStats)); - } - - public boolean passesPhasingThreshold(double PQ) { - return PQ >= phaseQualityThresh; - } - - private static class GenotypeAndReadBases { - public Genotype genotype; - public ReadBasesAtPosition readBases; - public GenomeLoc loc; - - public GenotypeAndReadBases(Genotype genotype, ReadBasesAtPosition readBases, GenomeLoc loc) { - this.genotype = genotype; - this.readBases = readBases; - this.loc = loc; - } - } - - private class PhasingWindow { - private Genotype[] hetGenotypes = null; - private CloneableIteratorLinkedList.CloneableIterator prevHetAndInteriorIt = null; - private int phasingSiteIndex = -1; - private Map readsAtHetSites = null; - - private void clearFields() { - hetGenotypes = null; - prevHetAndInteriorIt = null; - phasingSiteIndex = -1; - readsAtHetSites = null; - } - - public boolean hasPreviousHets() { - return phasingSiteIndex > 0; - } - - // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = 
vr.variant.getGenotype(sample)] - - public PhasingWindow(VariantAndReads vr, String sample) { - List listHetGenotypes = new LinkedList(); - - // Include previously phased sites in the phasing computation: - CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); - while (phasedIt.hasNext()) { - UnfinishedVariantAndReads phasedVr = phasedIt.next(); - Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break" - listHetGenotypes.clear(); // clear out any history - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); - prevHetAndInteriorIt = phasedIt.clone(); - } - } - phasingSiteIndex = listHetGenotypes.size(); - if (phasingSiteIndex == 0) { // no previous sites against which to phase - clearFields(); - return; - } - prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] - - if (respectPhaseInInput) { - Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample); - if (!prevHetGenotype.isPhased()) { - // Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]: - clearFields(); - return; - } - } - - // Add the (het) position to be phased: - GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); - GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); - listHetGenotypes.add(grbPhase); - if (DEBUG) - logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); - - // 
Include as-of-yet unphased sites in the phasing computation: - for (VariantAndReads nextVr : unphasedSiteQueue) { - if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc - break; - Genotype gt = nextVr.variant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break" - break; - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); - } - } - - // First, assemble the "sub-reads" from the COMPLETE WINDOW-BASED SET of heterozygous positions for this sample: - buildReadsAtHetSites(listHetGenotypes, sample, grbPhase.loc); - - // Remove extraneous reads (those that do not "connect" the two core phasing sites): - Set onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); - - // Dynamically modify the window to only include sites which have a non-empty set of reads: - listHetGenotypes = removeExtraneousSites(listHetGenotypes); - - // In any case, must still trim the window size to be "feasible" - // [**NOTE**: May want to do this to try maximize the preservation of paths from (phasingSiteIndex - 1) to phasingSiteIndex]: - if (listHetGenotypes.size() > maxPhaseSites) { - listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); - - // Can now remove any extra reads (and then sites): - buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); - onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); - listHetGenotypes = removeExtraneousSites(listHetGenotypes); - } - - // Lastly, assemble the "sub-reads" from the FINAL SET of heterozygous positions for this sample: - buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); - - // Copy to a fixed-size array: - if 
(DEBUG) - logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); - hetGenotypes = new Genotype[listHetGenotypes.size()]; - int index = 0; - for (GenotypeAndReadBases copyGrb : listHetGenotypes) - hetGenotypes[index++] = copyGrb.genotype; - } - - private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc) { - buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); - } - - private void buildReadsAtHetSites(List listHetGenotypes, Set onlyKeepReads) { - buildReadsAtHetSites(listHetGenotypes, null, null, onlyKeepReads); - } - - private void buildReadsAtHetSites(List listHetGenotypes, String sample, GenomeLoc phasingLoc, Set onlyKeepReads) { - readsAtHetSites = new HashMap(); - - int index = 0; - for (GenotypeAndReadBases grb : listHetGenotypes) { - ReadBasesAtPosition readBases = grb.readBases; - if (readBases != null) { - for (ReadBase rb : readBases) { - String readName = rb.readName; - if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads - continue; - - PhasingRead rd = readsAtHetSites.get(readName); - if (rd == null) { - rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual); - readsAtHetSites.put(readName, rd); - } - else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index - && sample != null && phasingLoc != null) { - outputMultipleBaseCountsWriter.setMultipleBases(new SampleReadLocus(sample, readName, grb.loc), phasingLoc, rd.getBase(index), rb.base); - } - - // Arbitrarily updates to the last base observed for this sample and read (rb.base): - rd.updateBaseAndQuality(index, rb.base, rb.baseQual); - } - } - index++; - } - if (DEBUG) logger.debug("Number of sites in window = " + index); - - if (DEBUG && logger.isDebugEnabled()) { - logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:"); - for (Map.Entry 
nameToReads : readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - logger.debug(rd + "\t" + rdName); - } - } - } - - private class EdgeToReads { - private TreeMap> edgeReads; - - public EdgeToReads() { - this.edgeReads = new TreeMap>(); // implemented GraphEdge.compareTo() - } - - public void addRead(PhasingGraphEdge e, String readName) { - List reads = edgeReads.get(e); - if (reads == null) { - reads = new LinkedList(); - edgeReads.put(e, reads); - } - reads.add(readName); - } - - public List getReads(PhasingGraphEdge e) { - return edgeReads.get(e); - } - } - - private class IntegerSet implements Iterable { - private Set list; - - public IntegerSet(Set list) { - this.list = list; - } - - public boolean contains(int i) { - return list.contains(i); - } - - public Iterator iterator() { - return list.iterator(); - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - for (int i : this) { - sb.append(i + ", "); - } - return sb.toString(); - } - } - - public Set removeExtraneousReads(int numHetSites) { - PhasingGraph readGraph = new PhasingGraph(numHetSites); - EdgeToReads edgeToReads = new EdgeToReads(); - Set sitesWithEdges = new TreeSet(); - - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - - int[] siteInds = rd.getNonNullIndices(); - // Connect each pair of non-null sites in rd: - for (int i = 0; i < siteInds.length; i++) { - for (int j = i + 1; j < siteInds.length; j++) { - PhasingGraphEdge e = new PhasingGraphEdge(siteInds[i], siteInds[j]); - if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e); - readGraph.addEdge(e); - - edgeToReads.addRead(e, rdName); - - sitesWithEdges.add(e.getV1()); - sitesWithEdges.add(e.getV2()); - } - } - } - if (DEBUG) logger.debug("Read graph:\n" + readGraph); - Set keepReads = new HashSet(); - - /* Check which Reads are involved in acyclic 
paths from (phasingSiteIndex - 1) to (phasingSiteIndex): - - In detail: - Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph". - A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by - considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and - cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant. - By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example, - if a path exists from prev ---> v1 that excludes v2. - - Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well. - And, if the path uses edges from other reads, then keeping all other reads that contribute those edges - [which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist. - - NOTE: - If we would use NON-UNIFORM priors for the various haplotypes consistent with a margnialized haplotype, then this calculation would not be correct, since the equivalence of: - 1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values). - 2. The read has edges involved in a path from prev ---> cur. - DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior. 
- - This is due to the following: - [We denote: - R = set of all reads - r = a single read - "AA + CC" = AA on top chromosome, CC on bottom chromosome] - - Note that since there are only two haplotype possibilities: - P(AA + CC | R) + P(AC + CA | R) = 1 - - Now, if we assume that all haplotypes consistent with AA + CC have the same prior probability [P(AA + CC | R)], then: - P(AA + CC | R) - = P(AAAA + CCCC | R) + ... + P(AACC + CCAA | R) - = [P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R)] / P(R) - \propto P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R) - = P(R | AAAA + CCCC)*P(AAAA + CCCC) + ... + P(R | AACC + CCAA)*P(AACC + CCAA) - = P(AA + CC | R) * [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] - - Since we assume independence between reads given a particular haplotype [P(R | AAAA + CCCC) = \prod_r P(r | AAAA + CCCC)], - a new read r affects P(AA + CC | R) by multiplying each of the terms in the sum by, e.g., P(r | AAAA + CCCC). - Therefore, if these values do not affect the ratio of: - (I) [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] / [P(R | ACAA + CACC) + ... + P(R | ACCC + CAAA)] - then they do not affect the value of: - (II) P(AA + CC | R) / P(AC + CA | R) [which uniquely defines their values, since they sum to 1] - - And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R. 
- */ - int prev = phasingSiteIndex - 1; - int cur = phasingSiteIndex; - - if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev - if (DEBUG) - logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); - readsAtHetSites.clear(); - return keepReads; - } - - /* Check the connected components of prev and cur when removing each individual vertex's edges: - [Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)] - */ - IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites]; - IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites]; - for (int i : sitesWithEdges) { - if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i); - - // Remove all edges incident to i and see which positions have paths to prev and cur: - Collection removedEdges = readGraph.removeAllIncidentEdges(i); - - // Run-time for efficiently calculating connected components using DisjointSet: O(E) - DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); - removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges)); - removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges)); - - if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]); - if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]); - - // Add the removed edges back in: - readGraph.addEdges(removedEdges); - } - - for (PhasingGraphEdge e : readGraph) { - if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e); - - /* Edge e={v1,v2} contributes a path between prev and cur for testRead iff: - testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa]. 
- Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges, - and the path from v2 ---> cur will NOT contain v1. - */ - boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.getV1()].contains(e.getV2()) && removedSiteSameCCAsCur[e.getV2()].contains(e.getV1()); - boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.getV2()].contains(e.getV1()) && removedSiteSameCCAsCur[e.getV1()].contains(e.getV2()); - - if (prevTo2and1ToCur || prevTo1and2ToCur) { - for (String readName : edgeToReads.getReads(e)) { - keepReads.add(readName); - - if (DEBUG && logger.isDebugEnabled()) { - if (prevTo2and1ToCur) - logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV2() + " -> " + e.getV1() + " ---> " + cur); - else - logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.getV1() + " -> " + e.getV2() + " ---> " + cur); - } - } - } - } - - // Retain only the reads that contain an edge in a path connecting prev and cur: - Iterator> readIt = readsAtHetSites.entrySet().iterator(); - while (readIt.hasNext()) { - Map.Entry nameToReads = readIt.next(); - String rdName = nameToReads.getKey(); - if (!keepReads.contains(rdName)) { - readIt.remove(); - if (DEBUG) logger.debug("Removing extraneous read: " + rdName); - } - } - - return keepReads; - } - - private List removeExtraneousSites(List listHetGenotypes) { - Set sitesWithReads = new HashSet(); - for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { - PhasingRead rd = nameToReads.getValue(); - for (int i : rd.getNonNullIndices()) - sitesWithReads.add(i); - } - - // Remove all sites that have no read bases: - List keepHetSites = new LinkedList(); - int index = 0; - int numPrecedingRemoved = 0; - for (GenotypeAndReadBases grb : listHetGenotypes) { - boolean keepSite = sitesWithReads.contains(index); - if (DEBUG && logger.isDebugEnabled() && !keepSite) - logger.debug("Removing read-less site " + grb.loc); - - if (keepSite || index == phasingSiteIndex || 
index == phasingSiteIndex - 1) { - keepHetSites.add(grb); - if (!keepSite) - if (DEBUG) - logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); - } - else if (index <= phasingSiteIndex) - numPrecedingRemoved++; - - index++; - } - - phasingSiteIndex -= numPrecedingRemoved; - return keepHetSites; - } - - private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { - if (DEBUG) - logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); - - int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes - int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! - - int numOnLeft = prevSiteIndex; - int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1); - - int useOnLeft, useOnRight; - if (numOnLeft <= numOnRight) { - int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side - useOnLeft = Math.min(halfToUse, numOnLeft); - useOnRight = Math.min(numToUse - useOnLeft, numOnRight); - } - else { // numOnRight < numOnLeft - int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] - useOnRight = Math.min(halfToUse, numOnRight); - useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); - } - int startIndex = prevSiteIndex - useOnLeft; - int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep - phasingSiteIndex -= startIndex; - listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); - if (DEBUG) - logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); - - return listHetGenotypes; - } - } - - private PhaseResult phaseSampleAtSite(PhasingWindow 
phaseWindow) { - /* Will map a phase and its "complement" to a single representative phase, - and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: - */ - HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2); - PhasingTable sampleHaps = tabCreator.getNewTable(); - - if (DEBUG && logger.isDebugEnabled()) { - logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); - logger.debug("USED READS:"); - for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { - String rdName = nameToReads.getKey(); - PhasingRead rd = nameToReads.getValue(); - logger.debug(rd + "\t" + rdName); - } - } - - // Update the phasing table based on each of the sub-reads for this sample: - MaxHaplotypeAndQuality prevMaxHapAndQual = null; - - int numHighQualityIterations = 0; - int numInconsistentIterations = 0; - - double totalAbsPQchange = 0; - int numPQchangesObserved = 0; - - for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { - PhasingRead rd = nameToReads.getValue(); - if (DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); - - for (PhasingTable.PhasingTableEntry pte : sampleHaps) { - PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); - pte.getScore().integrateReadScore(score); - if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); - } - - // Check the current best haplotype assignment and compare it to the previous one: - MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); - if (DEBUG) - logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); - if (prevMaxHapAndQual != null) { - double changeInPQ = 
prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; - - if (passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { - numHighQualityIterations++; - if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase - (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ - if (DEBUG) logger.debug("Inconsistent read found!"); - numInconsistentIterations++; - } - } - - totalAbsPQchange += Math.abs(changeInPQ); - numPQchangesObserved++; - } - prevMaxHapAndQual = curMaxHapAndQual; - } - - if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); - MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, DEBUG); - double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); - - if (DEBUG) - logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality); - if (DEBUG) - logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations); - - boolean phasingContainsInconsistencies = false; - if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS) - phasingContainsInconsistencies = true; - - return new PhaseResult(maxHapQual.getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies); - } - - private static class MaxHaplotypeAndQuality { - public PhasingTable.PhasingTableEntry maxEntry; - public double phaseQuality; - - public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) { - // Marginalize each haplotype to its first 2 positions: - hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable); - if (printDebug) - logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + 
"\n"); - - calculateMaxHapAndPhasingQuality(hapTable, printDebug); - } - - // Calculates maxEntry and its PQ (within table hapTable): - - private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) { - hapTable.normalizeScores(); - if (printDebug) - logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n"); - - // Determine the phase at this position: - this.maxEntry = hapTable.maxEntry(); - - // convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.trueProbToQual(posteriorProb): - PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO); - for (PhasingTable.PhasingTableEntry pte : hapTable) { - if (pte != maxEntry) - sumErrorProbs.plusEqual(pte.getScore()); - } - this.phaseQuality = -10.0 * (sumErrorProbs.getLog10Value()); - } - - public boolean hasSameRepresentativeHaplotype(MaxHaplotypeAndQuality that) { - return this.getRepresentative().equals(that.getRepresentative()); - } - - private Haplotype getRepresentative() { - return maxEntry.getHaplotypeClass().getRepresentative(); - } - } - - /* - Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap. 
- */ - - public static void ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, Haplotype hap) { - if (hap.size() < 2) - throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!"); - - byte prevBase = hap.getBase(0); // The 1st base in the haplotype - byte curBase = hap.getBase(1); // The 2nd base in the haplotype - - boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); - boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); - if (chosePrevTopChrom != choseCurTopChrom) - curAllelePair.swapAlleles(); - } - - private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { - return startDistancesAreInWindowRange(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); - } - - private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) { - return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2) - } - - private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) { - return uvc1.getLocation().distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2)); - } - - public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) { - if (statsAndList != null) { - writeVcList(statsAndList.output); - stats.addIn(statsAndList.ps); - } - return stats; - } - - /** - * Phase anything left in the cached unphasedSiteQueue, and report the number of reads and VariantContexts processed. - * - * @param result the number of reads and VariantContexts seen. 
- */ - public void onTraversalDone(PhasingStats result) { - List finalList = processQueue(result, true); // process all remaining data - writeVcList(finalList); - writer.close(); - - if (statsWriter != null) - statsWriter.close(); - - if (outputMultipleBaseCountsWriter != null) - outputMultipleBaseCountsWriter.close(); - - System.out.println("Coverage over ALL samples:"); - System.out.println("Number of reads observed: " + result.getNumReads()); - System.out.println("Number of variant sites observed: " + result.getNumVarSites()); - System.out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites())); - - System.out.println("\n--- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites: " + maxPhaseSites + ", cacheWindow: " + cacheWindow + "] ---"); - for (Map.Entry sampPhaseCountEntry : result.getPhaseCounts()) { - PhaseCounts pc = sampPhaseCountEntry.getValue(); - System.out.print("Sample: " + sampPhaseCountEntry.getKey() + "\tSites tested: " + pc.numTestedSites + "\tSites phased: " + pc.numPhased); - System.out.println("\tPhase-inconsistent sites: " + (pc.numInconsistentSitesPhased + pc.numInconsistentSitesNotPhased) + " [phased: " + pc.numInconsistentSitesPhased + ", unphased:" + pc.numInconsistentSitesNotPhased + "]"); - } - System.out.println(""); - } - - private void writeVcList(List varContList) { - for (VariantContext vc : varContList) - writeVCF(vc); - } - - private void writeVCF(VariantContext vc) { - if (samplesToPhase == null || vc.isNotFiltered()) - //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant - writer.add(vc); - } - - public static boolean processVariantInPhasing(VariantContext vc) { - return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well - //return 
isUnfilteredBiallelicSNP(vc); - } - - - /* - Inner classes: - */ - - private class VariantAndReads { - public VariantContext variant; - public HashMap sampleReadBases; - - public VariantAndReads(VariantContext variant, HashMap sampleReadBases) { - this.variant = variant; - this.sampleReadBases = sampleReadBases; - } - - public VariantAndReads(VariantContext variant, AlignmentContext alignment) { - this.variant = variant; - this.sampleReadBases = new HashMap(); - - if (alignment != null) { - ReadBackedPileup pileup = alignment.getBasePileup(); - if (pileup != null) { - // filter the read-base pileup based on min base and mapping qualities: - pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE); - if (pileup != null) { - for (final String sample : pileup.getSamples()) { - ReadBackedPileup samplePileup = pileup.getPileupForSample(sample); - ReadBasesAtPosition readBases = new ReadBasesAtPosition(); - for (PileupElement p : samplePileup) { - if (!p.isDeletion()) // IGNORE deletions for now - readBases.putReadBase(p); - } - sampleReadBases.put(sample, readBases); - } - } - } - } - } - } - - private class UnfinishedVariantAndReads { - public UnfinishedVariantContext unfinishedVariant; - public HashMap sampleReadBases; - - public UnfinishedVariantAndReads(VariantAndReads vr) { - this.unfinishedVariant = new UnfinishedVariantContext(vr.variant); - this.sampleReadBases = vr.sampleReadBases; - } - } - - // COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]... 
- - private class UnfinishedVariantContext implements HasGenomeLocation { - private String name; - private String contig; - private int start; - private int stop; - private Collection alleles; - private Map genotypes; - private double log10PError; - private Set filters; - private Map attributes; - private String id; - - public UnfinishedVariantContext(VariantContext vc) { - this.name = vc.getSource(); - this.id = vc.getID(); - this.contig = vc.getChr(); - this.start = vc.getStart(); - this.stop = vc.getEnd(); - this.alleles = vc.getAlleles(); - - this.genotypes = new HashMap(); - for ( final Genotype g : vc.getGenotypes() ) { - this.genotypes.put(g.getSampleName(), g); - } - - this.log10PError = vc.getLog10PError(); - this.filters = vc.filtersWereApplied() ? vc.getFilters() : null; - this.attributes = new HashMap(vc.getAttributes()); - } - - public VariantContext toVariantContext() { - GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); - return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) - .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); - } - - public GenomeLoc getLocation() { - return getToolkit().getGenomeLocParser().createGenomeLoc(contig, start, stop); - } - - public Genotype getGenotype(String sample) { - return genotypes.get(sample); - } - - public void setGenotype(String sample, Genotype newGt) { - this.genotypes.put(sample, newGt); - } - - public void setPhasingInconsistent() { - attributes.put(PHASING_INCONSISTENT_KEY, true); - } - } - - private static String toStringGRL(List grbList) { - boolean first = true; - StringBuilder sb = new StringBuilder(); - for (GenotypeAndReadBases grb : grbList) { - if (first) - first = false; - else - sb.append(" -- "); - - sb.append(grb.loc); - } - return sb.toString(); - } - - private String toStringVCL(List vcList) { - boolean first = true; - StringBuilder sb = new StringBuilder(); - for (VariantContext vc : vcList) { - if (first) - first 
= false; - else - sb.append(" -- "); - - sb.append(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); - } - return sb.toString(); - } - -// -// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() -// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH -// AN ArrayList OF Allele [OR SIMILAR OBJECT], and WON'T USE: getSingleBase(alleleI) -// - - private static abstract class HaplotypeTableCreator { - protected Genotype[] genotypes; - - public HaplotypeTableCreator(Genotype[] hetGenotypes) { - this.genotypes = hetGenotypes; - } - - abstract public PhasingTable getNewTable(); - - protected List getAllHaplotypes() { - int numSites = genotypes.length; - int[] genotypeCards = new int[numSites]; - for (int i = 0; i < numSites; i++) - genotypeCards[i] = genotypes[i].getPloidy(); - - LinkedList allHaps = new LinkedList(); - CardinalityCounter alleleCounter = new CardinalityCounter(genotypeCards); - for (int[] alleleInds : alleleCounter) { - byte[] hapBases = new byte[numSites]; - for (int i = 0; i < numSites; i++) { - Allele alleleI = genotypes[i].getAllele(alleleInds[i]); - hapBases[i] = SNPallelePair.getSingleBase(alleleI); - } - allHaps.add(new Haplotype(hapBases)); - } - return allHaps; - } - - public static PhasingTable marginalizeAsNewTable(PhasingTable table) { - TreeMap hapMap = new TreeMap(); - for (PhasingTable.PhasingTableEntry pte : table) { - Haplotype rep = pte.getHaplotypeClass().getRepresentative(); - PreciseNonNegativeDouble score = hapMap.get(rep); - if (score == null) { - score = new PreciseNonNegativeDouble(ZERO); - hapMap.put(rep, score); - } - score.plusEqual(pte.getScore()); - } - - PhasingTable margTable = new PhasingTable(); - for (Map.Entry hapClassAndScore : hapMap.entrySet()) { - Haplotype rep = hapClassAndScore.getKey(); - ArrayList hapList = new ArrayList(); - hapList.add(rep); - - 
HaplotypeClass hc = new HaplotypeClass(hapList, rep); - margTable.addEntry(hc, hapClassAndScore.getValue()); - } - return margTable; - } - } - - private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator { - private SNPallelePair[] SNPallelePairs; - private int startIndex; - private int marginalizeLength; - - public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { - super(hetGenotypes); - - this.SNPallelePairs = new SNPallelePair[genotypes.length]; - for (int i = 0; i < genotypes.length; i++) - SNPallelePairs[i] = new SNPallelePair(genotypes[i]); - - this.startIndex = startIndex; - this.marginalizeLength = marginalizeLength; - } - - public PhasingTable getNewTable() { - PhasingTable table = new PhasingTable(); - for (Haplotype hap : getAllHaplotypes()) { - if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { - /* hap is the "representative" haplotype [DEFINED here to be - the one with the top base at the startIndex position. - NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!] 
- */ - ArrayList hapList = new ArrayList(); - hapList.add(hap); - hapList.add(complement(hap)); - - // want marginalizeLength positions starting at startIndex: - Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength); - double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype - - HaplotypeClass hapClass = new HaplotypeClass(hapList, rep); - table.addEntry(hapClass, hapClassPrior); - } - } - return table; - } - - // Can change later to weight the representative Haplotypes differently: - - private double getHaplotypeRepresentativePrior(Haplotype rep) { - return 1.0; - } - - private Haplotype complement(Haplotype hap) { - int numSites = SNPallelePairs.length; - if (hap.size() != numSites) - throw new ReviewedStingException("INTERNAL ERROR: hap.size() != numSites"); - - // Take the other base at EACH position of the Haplotype: - byte[] complementBases = new byte[numSites]; - for (int i = 0; i < numSites; i++) - complementBases[i] = SNPallelePairs[i].getOtherBase(hap.getBase(i)); - - return new Haplotype(complementBases); - } - } - - private static class PhasingTable implements Iterable { - private LinkedList table; - - public PhasingTable() { - this.table = new LinkedList(); - } - - public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, PreciseNonNegativeDouble initialScore) { - PhasingTableEntry pte = new PhasingTableEntry(haplotypeClass, new PhasingScore(initialScore)); - table.add(pte); - return pte; - } - - public PhasingTableEntry addEntry(HaplotypeClass haplotypeClass, double initialScore) { - return addEntry(haplotypeClass, new PreciseNonNegativeDouble(initialScore)); - } - - public Iterator iterator() { - return table.iterator(); - } - - public boolean isEmpty() { - return table.isEmpty(); - } - - public PhasingTableEntry maxEntry() { - if (table.isEmpty()) - return null; - - PhasingTableEntry maxPte = null; - for (PhasingTableEntry pte : table) { - if 
(maxPte == null || pte.getScore().gt(maxPte.getScore())) { - maxPte = pte; - } - } - return maxPte; - } - - public void normalizeScores() { - PreciseNonNegativeDouble normalizeBy = new PreciseNonNegativeDouble(ZERO); - for (PhasingTableEntry pte : table) - normalizeBy.plusEqual(pte.getScore()); - - if (!normalizeBy.equals(ZERO)) { // prevent precision problems - for (PhasingTableEntry pte : table) - pte.getScore().divEqual(normalizeBy); - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("-------------------\n"); - for (PhasingTableEntry pte : this) { - sb.append("Haplotypes:\t" + pte.getHaplotypeClass() + "\tScore:\t" + pte.getScore() + "\n"); - } - sb.append("-------------------\n"); - return sb.toString(); - } - - public static class PhasingTableEntry implements Comparable { - private HaplotypeClass haplotypeClass; - private PhasingScore score; - - public PhasingTableEntry(HaplotypeClass haplotypeClass, PhasingScore score) { - this.haplotypeClass = haplotypeClass; - this.score = score; - } - - public HaplotypeClass getHaplotypeClass() { - return haplotypeClass; - } - - public PhasingScore getScore() { - return score; - } - - public int compareTo(PhasingTableEntry that) { - return this.getScore().compareTo(that.getScore()); - } - } - } - - private static class PhaseResult { - public Haplotype haplotype; - public double phaseQuality; - public boolean phasingContainsInconsistencies; - - public PhaseResult(Haplotype haplotype, double phaseQuality, boolean phasingContainsInconsistencies) { - this.haplotype = haplotype; - this.phaseQuality = phaseQuality; - this.phasingContainsInconsistencies = phasingContainsInconsistencies; - } - } - - public static boolean isUnfilteredBiallelicSNP(VariantContext vc) { - return (vc.isNotFiltered() && vc.isSNP() && vc.isBiallelic()); - } - - public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) { - return (! 
gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2); - } - - private class MultipleBaseCountsWriter { - private BufferedWriter writer = null; - private TreeMap multipleBaseCounts = null; - - public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) { - FileOutputStream output; - try { - output = new FileOutputStream(outputMultipleBaseCountsFile); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to create multiple base count file at location: " + outputMultipleBaseCountsFile); - } - this.writer = new BufferedWriter(new OutputStreamWriter(output)); - - this.multipleBaseCounts = new TreeMap(); // implemented SampleReadLocus.compareTo() - } - - public void setMultipleBases(SampleReadLocus srl, GenomeLoc phasingLoc, byte prevBase, byte newBase) { - MultipleBaseCounts mbc = multipleBaseCounts.get(srl); - if (mbc == null) { - mbc = new MultipleBaseCounts(phasingLoc); - mbc.incrementBaseCount(prevBase); // only now, do we know to note this - multipleBaseCounts.put(srl, mbc); - } - if (mbc.samePhasingLocAs(phasingLoc)) // otherwise, don't want to count these multiple base counts again - mbc.incrementBaseCount(newBase); - - } - - public void outputMultipleBaseCounts() { - GenomeLoc nextToPhaseLoc = null; - if (!unphasedSiteQueue.isEmpty()) - nextToPhaseLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant); - - outputMultipleBaseCounts(nextToPhaseLoc); - } - - private void outputMultipleBaseCounts(GenomeLoc nextToPhaseLoc) { - try { - Iterator> multBaseCountIt = multipleBaseCounts.entrySet().iterator(); - while (multBaseCountIt.hasNext()) { - Map.Entry sampleReadLocBaseCountsEntry = multBaseCountIt.next(); - SampleReadLocus srl = sampleReadLocBaseCountsEntry.getKey(); - if (nextToPhaseLoc == null || !startDistancesAreInWindowRange(srl.getLocus(), nextToPhaseLoc)) { - // Done with entry, so print it and remove it from map: - writer.write(srl + "\t" + 
sampleReadLocBaseCountsEntry.getValue() + "\n"); - multBaseCountIt.remove(); - } - } - writer.flush(); - } catch (IOException e) { - throw new RuntimeException("Unable to write to outputMultipleBaseCountsFile", e); - } - } - - public void close() { - outputMultipleBaseCounts(null); - - try { - writer.flush(); - writer.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close outputMultipleBaseCountsFile"); - } - } - } -} - - -class PhasingScore extends PreciseNonNegativeDouble { - public PhasingScore(double score) { - super(score); - } - - public PhasingScore(PreciseNonNegativeDouble val) { - super(val); - } - - public PhasingScore integrateReadScore(PhasingScore score) { - timesEqual(score); - return this; - } -} - -class HaplotypeClass implements Iterable { - private ArrayList haps; - private Haplotype rep; - - public HaplotypeClass(ArrayList haps, Haplotype rep) { - this.haps = haps; - this.rep = rep; - } - - public Iterator iterator() { - return haps.iterator(); - } - - public Haplotype getRepresentative() { - return rep; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - boolean isFirst = true; - for (Haplotype h : haps) { - if (isFirst) - isFirst = false; - else - sb.append(" + "); - - sb.append(h); - } - sb.append(" [").append(rep).append("]"); - return sb.toString(); - } -} - -class PhasingStats { - private int numReads; - private int numVarSites; - - // Map of: sample -> PhaseCounts: - private Map samplePhaseStats; - - public PhasingStats() { - this(new TreeMap()); - } - - public PhasingStats(int numReads, int numVarSites) { - this.numReads = numReads; - this.numVarSites = numVarSites; - this.samplePhaseStats = new TreeMap(); - } - - public PhasingStats(Map samplePhaseStats) { - this.numReads = 0; - this.numVarSites = 0; - this.samplePhaseStats = samplePhaseStats; - } - - public void addIn(PhasingStats other) { - this.numReads += other.numReads; - this.numVarSites += other.numVarSites; - - for (Map.Entry 
sampPhaseEntry : other.samplePhaseStats.entrySet()) { - String sample = sampPhaseEntry.getKey(); - PhaseCounts otherCounts = sampPhaseEntry.getValue(); - PhaseCounts thisCounts = this.samplePhaseStats.get(sample); - if (thisCounts == null) { - thisCounts = new PhaseCounts(); - this.samplePhaseStats.put(sample, thisCounts); - } - thisCounts.addIn(otherCounts); - } - } - - public int getNumReads() { - return numReads; - } - - public int getNumVarSites() { - return numVarSites; - } - - public Collection> getPhaseCounts() { - return samplePhaseStats.entrySet(); - } -} - -class PhaseCounts { - public int numTestedSites; // number of het sites directly succeeding het sites - public int numInconsistentSitesPhased; - public int numInconsistentSitesNotPhased; - public int numPhased; - - public PhaseCounts() { - this.numTestedSites = 0; - this.numInconsistentSitesPhased = 0; - this.numInconsistentSitesNotPhased = 0; - this.numPhased = 0; - } - - public void addIn(PhaseCounts other) { - this.numTestedSites += other.numTestedSites; - this.numInconsistentSitesPhased += other.numInconsistentSitesPhased; - this.numInconsistentSitesNotPhased += other.numInconsistentSitesNotPhased; - this.numPhased += other.numPhased; - } -} - -class PhasingStatsAndOutput { - public PhasingStats ps; - public List output; - - public PhasingStatsAndOutput(PhasingStats ps, List output) { - this.ps = ps; - this.output = output; - } -} - -class PhasingQualityStatsWriter { - private String variantStatsFilePrefix; - private HashMap sampleToStatsWriter = new HashMap(); - - public PhasingQualityStatsWriter(String variantStatsFilePrefix) { - this.variantStatsFilePrefix = variantStatsFilePrefix; - } - - public void addStat(String sample, GenomeLoc locus, int startDistanceFromPrevious, double phasingQuality, int numReads, int windowSize) { - BufferedWriter sampWriter = sampleToStatsWriter.get(sample); - if (sampWriter == null) { - String fileName = variantStatsFilePrefix + "." 
+ sample + ".locus_distance_PQ_numReads_windowSize.txt"; - - FileOutputStream output; - try { - output = new FileOutputStream(fileName); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to create phasing quality stats file at location: " + fileName); - } - sampWriter = new BufferedWriter(new OutputStreamWriter(output)); - sampleToStatsWriter.put(sample, sampWriter); - } - try { - sampWriter.write(locus + "\t" + startDistanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\t" + windowSize + "\n"); - sampWriter.flush(); - } catch (IOException e) { - throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); - } - } - - public void close() { - for (Map.Entry sampWriterEntry : sampleToStatsWriter.entrySet()) { - BufferedWriter sampWriter = sampWriterEntry.getValue(); - try { - sampWriter.flush(); - sampWriter.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close per-sample phasing quality stats file"); - } - } - } -} - -class SampleReadLocus implements Comparable { - private String sample; - private String read; - private GenomeLoc locus; - - public SampleReadLocus(String sample, String read, GenomeLoc locus) { - this.sample = sample; - this.read = read; - this.locus = locus; - } - - public GenomeLoc getLocus() { - return locus; - } - - public int compareTo(SampleReadLocus that) { - int comp = this.sample.compareTo(that.sample); - if (comp != 0) - return comp; - - comp = this.read.compareTo(that.read); - if (comp != 0) - return comp; - - return this.locus.compareTo(that.locus); - } - - public String toString() { - return "Sample " + sample + ", read " + read + ", locus " + locus; - } -} - -class MultipleBaseCounts { - private Map baseCounts; - private GenomeLoc phasingLocus; - - public MultipleBaseCounts(GenomeLoc phasingLoc) { - this.baseCounts = new HashMap(); - this.phasingLocus = phasingLoc; - } - - public boolean samePhasingLocAs(GenomeLoc loc) { - return 
phasingLocus.equals(loc); - } - - public void incrementBaseCount(byte base) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - Integer cnt = baseCounts.get(baseIndex); - if (cnt == null) - cnt = 0; - - baseCounts.put(baseIndex, cnt + 1); - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append("Base counts"); - for (Map.Entry baseCountEntry : baseCounts.entrySet()) { - byte base = BaseUtils.baseIndexToSimpleBase(baseCountEntry.getKey()); - int cnt = baseCountEntry.getValue(); - sb.append("\t" + (char) base + ": " + cnt); - } - - return sb.toString(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java deleted file mode 100644 index 63bd5f14d..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Mar 10, 2011 - */ - -public class Tranche { - private static final int CURRENT_VERSION = 5; - - public double ts, minVQSLod, knownTiTv, novelTiTv; - public int numKnown,numNovel; - public String name; - public VariantRecalibratorArgumentCollection.Mode model; - - int accessibleTruthSites = 0; - int callsAtTruthSites = 0; - - public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model) { - this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, model, "anonymous"); - } - - public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, VariantRecalibratorArgumentCollection.Mode model, String name ) { - this.ts = ts; - this.minVQSLod = minVQSLod; - this.novelTiTv = novelTiTv; - this.numNovel = numNovel; - this.knownTiTv = knownTiTv; - this.numKnown = numKnown; - this.model = model; - this.name = name; - - this.accessibleTruthSites = accessibleTruthSites; - this.callsAtTruthSites = callsAtTruthSites; - - if ( ts < 0.0 || ts > 100.0) - throw new UserException("Target FDR is unreasonable " + ts); - - if ( numKnown < 0 || numNovel < 0) - throw new ReviewedStingException("Invalid tranche - no. 
variants is < 0 : known " + numKnown + " novel " + numNovel); - - if ( name == null ) - throw new ReviewedStingException("BUG -- name cannot be null"); - } - - private double getTruthSensitivity() { - return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; - } - - public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { - @Override - public int compare(final Tranche tranche1, final Tranche tranche2) { - return Double.compare(tranche1.ts, tranche2.ts); - } - } - - @Override - public String toString() { - return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", - ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); - } - - /** - * Returns an appropriately formatted string representing the raw tranches file on disk. - * - * @param tranches - * @return - */ - public static String tranchesString( final List tranches ) { - final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - final PrintStream stream = new PrintStream(bytes); - - Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); - - stream.println("# Variant quality score tranches file"); - stream.println("# Version number " + CURRENT_VERSION); - stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); - - Tranche prev = null; - for ( Tranche t : tranches ) { - stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,VQSRTranche%s%.2fto%.2f,%s,%d,%d,%.4f%n", - t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod, t.model.toString(), - (prev == null ? 
0.0 : prev.ts), t.ts, t.model.toString(), t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); - prev = t; - } - - return bytes.toString(); - } - - private static double getDouble(Map bindings, String key, boolean required) { - if ( bindings.containsKey(key) ) { - String val = bindings.get(key); - return Double.valueOf(val); - } - else if ( required ) { - throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); - } - else - return -1; - } - - private static int getInteger(Map bindings, String key, boolean required) { - if ( bindings.containsKey(key) ) - return Integer.valueOf(bindings.get(key)); - else if ( required ) { - throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); - } - else - return -1; - } - - /** - * Returns a list of tranches, sorted from most to least specific, read in from file f - * - * @param f - * @return - */ - public static List readTranches(File f) { - String[] header = null; - List tranches = new ArrayList(); - - try { - for( final String line : new XReadLines(f) ) { - if ( line.startsWith("#") ) - continue; - - final String[] vals = line.split(","); - if( header == null ) { - header = vals; - if ( header.length == 5 || header.length == 8 || header.length == 10 ) - // old style tranches file, throw an error - throw new UserException.MalformedFile(f, "Unfortunately your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator"); - if ( header.length != 11 ) - throw new UserException.MalformedFile(f, "Expected 11 elements in header line " + line); - } else { - if ( header.length != vals.length ) - throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + ". 
The line was: " + line); - - Map bindings = new HashMap(); - for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); - tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true), - getDouble(bindings,"minVQSLod", true), - getInteger(bindings,"numKnown", false), - getDouble(bindings,"knownTiTv", false), - getInteger(bindings,"numNovel", true), - getDouble(bindings,"novelTiTv", true), - getInteger(bindings,"accessibleTruthSites", false), - getInteger(bindings,"callsAtTruthSites", false), - VariantRecalibratorArgumentCollection.parseString(bindings.get("model")), - bindings.get("filterName"))); - } - } - - Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); - return tranches; - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(f, e); - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java deleted file mode 100644 index 5da7b4219..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ /dev/null @@ -1,564 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.PartitionBy; -import org.broadinstitute.sting.gatk.walkers.PartitionType; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.R.RScriptExecutor; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; -import org.broadinstitute.sting.utils.exceptions.UserException; -import 
org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.io.Resource; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. - * - *

- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. - *

- * - *

- * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. - * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. - * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic - * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided - * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive - * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the - * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is - * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. - *

- * - *

Inputs

- *

- * The input raw variants to be recalibrated. - *

- * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. - * - *

Output

- *

- * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. - *

- * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. - * - *

Example

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T VariantRecalibrator \
- *   -R reference/human_g1k_v37.fasta \
- *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
- *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
- *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
- *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
- *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
- *   -mode SNP \
- *   -recalFile path/to/output.recal \
- *   -tranchesFile path/to/output.tranches \
- *   -rscriptFile path/to/output.plots.R
- * 
- * - *

Caveat

- * - *
    - *
  • The values used in the example above are only meant to show how the command lines are composed. - * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be - * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on - * how to set parameter values for you own analyses, please read the Best Practices section of the documentation.
  • - * - *
  • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). - * See http://www.r-project.org for more info on how to download and install R.
  • - *
- */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.NONE) -public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { - - public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model - public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out - public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set - public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set - private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; - - @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); - - ///////////////////////////// - // Inputs - ///////////////////////////// - /** - * These calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. - */ - @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) - public List> input; - - /** - * These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling. - */ - @Input(fullName="aggregate", shortName = "aggregate", doc="Additional raw input variants to be used in building the model", required=false) - public List> aggregate; - - /** - * Any set of VCF files to use as lists of training, truth, or known sites. - * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. 
- * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. - * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. - * Bad - In addition to using the set of worst ranked variants as compared to the Gaussian mixture model (see -numBad argument), we can also supplement the list with a database of known bad variants. - */ - @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)", required=true) - public List> resource = Collections.emptyList(); - - ///////////////////////////// - // Outputs - ///////////////////////////// - @Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) - protected VariantContextWriter recalWriter = null; - - @Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true) - protected File TRANCHES_FILE; - - ///////////////////////////// - // Additional Command Line Arguments - ///////////////////////////// - /** - * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific - * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best - * practices documentation (http://www.broadinstitute.org/gatk/guide/best-practices) for more information. - * Normal values are 2.15 for human whole genome values and 3.2 for human whole exomes. Note - * that this parameter is used for display purposes only and isn't used anywhere in the algorithm! 
- */ - @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false) - protected double TARGET_TITV = 2.15; - - /** - * See the input VCF file's INFO field for a list of all available annotations. - */ - @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true) - private String[] USE_ANNOTATIONS = null; - - /** - * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0 - * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible - * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%. - */ - @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) - private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; - /** - * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command. 
- */ - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false) - private String[] IGNORE_INPUT_FILTERS = null; - @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false) - private File RSCRIPT_FILE = null; - - @Hidden - @Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false) - protected int REPLICATE = 200; - private ArrayList replicate = new ArrayList<>(); - - ///////////////////////////// - // Debug Arguments - ///////////////////////////// - @Advanced - @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false) - protected Boolean TRUST_ALL_POLYMORPHIC = false; - - ///////////////////////////// - // Private Member Variables - ///////////////////////////// - private VariantDataManager dataManager; - private PrintStream tranchesStream; - private final Set ignoreInputFilterSet = new TreeSet<>(); - private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public void initialize() { - dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC ); - - if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) - Utils.warnUser(logger, String.format( - "Rscript not found in 
environment path. %s will be generated but PDF plots will not.", - RSCRIPT_FILE)); - - if( IGNORE_INPUT_FILTERS != null ) { - ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); - } - - try { - tranchesStream = new PrintStream(TRANCHES_FILE); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e); - } - - for( RodBinding rod : resource ) { - dataManager.addTrainingSet( new TrainingSet( rod ) ); - } - - if( !dataManager.checkHasTrainingSet() ) { - throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); - } - if( !dataManager.checkHasTruthSet() ) { - throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -resource:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); - } - - - final Set hInfo = new HashSet<>(); - ApplyRecalibration.addVQSRStandardHeaderLines(hInfo); - recalWriter.writeHeader( new VCFHeader(hInfo) ); - - for( int iii = 0; iii < REPLICATE * 2; iii++ ) { - replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble()); - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public ExpandingArrayList map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - final ExpandingArrayList mapList = new ExpandingArrayList<>(); - - if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers - return mapList; - } - - mapList.addAll( addOverlappingVariants(input, 
true, tracker, context) ); - if( aggregate != null ) { - mapList.addAll( addOverlappingVariants(aggregate, false, tracker, context) ); - } - - return mapList; - } - - /** - * Using the RefMetaDataTracker find overlapping variants and pull out the necessary information to create the VariantDatum - * @param rods the rods to search within - * @param isInput is this rod an -input rod? - * @param tracker the RefMetaDataTracker from the RODWalker map call - * @param context the AlignmentContext from the RODWalker map call - * @return a list of VariantDatums, can be empty - */ - private List addOverlappingVariants( final List> rods, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context ) { - if( rods == null ) { throw new IllegalArgumentException("rods cannot be null."); } - if( tracker == null ) { throw new IllegalArgumentException("tracker cannot be null."); } - if( context == null ) { throw new IllegalArgumentException("context cannot be null."); } - - final ExpandingArrayList variants = new ExpandingArrayList<>(); - - for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) { - if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) { - if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) { - final VariantDatum datum = new VariantDatum(); - - // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. - dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine - datum.loc = ( isInput ? 
getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null ); - datum.originalQual = vc.getPhredScaledQual(); - datum.isSNP = vc.isSNP() && vc.isBiallelic(); - datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); - datum.isAggregate = !isInput; - - // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately - dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC ); - final double priorFactor = QualityUtils.qualToProb( datum.prior ); - datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor ); - - variants.add( datum ); - } - } - } - - return variants; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public ExpandingArrayList reduceInit() { - return new ExpandingArrayList<>(); - } - - @Override - public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { - reduceSum.addAll( mapValue ); - return reduceSum; - } - - @Override - public ExpandingArrayList treeReduce( final ExpandingArrayList lhs, final ExpandingArrayList rhs ) { - rhs.addAll( lhs ); - return rhs; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // on traversal done - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public void onTraversalDone( final ExpandingArrayList reduceSum ) { - dataManager.setData( reduceSum ); - dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation - - // Generate the positive model using the training data and evaluate each variant - final List positiveTrainingData = 
dataManager.getTrainingData(); - final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS ); - engine.evaluateData( dataManager.getData(), goodModel, false ); - - // Generate the negative model using the worst performing data and evaluate each variant contrastively - final List negativeTrainingData = dataManager.selectWorstVariants(); - final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); - dataManager.dropAggregateData(); // Don't need the aggregate data anymore so let's free up the memory - engine.evaluateData( dataManager.getData(), badModel, true ); - - if( badModel.failedToConverge || goodModel.failedToConverge ) { - throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") ); - } - - engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); - - // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user - final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY ); - final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); - final List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE ); - tranchesStream.print(Tranche.tranchesString( tranches )); - - logger.info( "Writing out recalibration table..." 
); - dataManager.writeOutRecalibrationTable( recalWriter ); - if( RSCRIPT_FILE != null ) { - logger.info( "Writing out visualization Rscript file..."); - createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) ); - } - - if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) { - // Print out an info message to make it clear why the tranches plot is not generated - logger.info("Tranches plot will not be generated since we are running in INDEL mode"); - } else { - // Execute the RScript command to plot the table of truth values - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); - executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); - // Print out the command line to make it clear to the user what is being executed and how one might modify it - logger.info("Executing: " + executor.getApproximateCommandLine()); - executor.exec(); - } - } - - private void createVisualizationScript( final List randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { - PrintStream stream; - try { - stream = new PrintStream(RSCRIPT_FILE); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(RSCRIPT_FILE, e); - } - - // We make extensive use of the ggplot2 R library: http://had.co.nz/ggplot2/ - stream.println("library(ggplot2)"); - // For compactPDF in R 2.13+ - stream.println("library(tools)"); - // For graphical functions R 2.14.2+ - stream.println("library(grid)"); - - createArrangeFunction( stream ); - - stream.println("outputPDF <- \"" + RSCRIPT_FILE + ".pdf\""); - stream.println("pdf(outputPDF)"); // Unfortunately this is a huge pdf file, BUGBUG: need to work on 
reducing the file size - - for(int iii = 0; iii < annotationKeys.length; iii++) { - for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) { - logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot..."); - - final List fakeData = new ExpandingArrayList<>(); - double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0; - for( final VariantDatum datum : randomData ) { - minAnn1 = Math.min(minAnn1, datum.annotations[iii]); - maxAnn1 = Math.max(maxAnn1, datum.annotations[iii]); - minAnn2 = Math.min(minAnn2, datum.annotations[jjj]); - maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]); - } - // Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D - final double NUM_STEPS = 60.0; - for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) { - for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) { - final VariantDatum datum = new VariantDatum(); - datum.prior = 0.0; - datum.annotations = new double[randomData.get(0).annotations.length]; - datum.isNull = new boolean[randomData.get(0).annotations.length]; - for(int ann=0; ann< datum.annotations.length; ann++) { - datum.annotations[ann] = 0.0; - datum.isNull[ann] = true; - } - datum.annotations[iii] = ann1; - datum.annotations[jjj] = ann2; - datum.isNull[iii] = false; - datum.isNull[jjj] = false; - fakeData.add(datum); - } - } - - engine.evaluateData( fakeData, goodModel, false ); - engine.evaluateData( fakeData, badModel, true ); - - stream.print("surface <- c("); - for( final VariantDatum datum : fakeData ) { - stream.print(String.format("%.4f, %.4f, %.4f, ", - dataManager.denormalizeDatum(datum.annotations[iii], iii), - dataManager.denormalizeDatum(datum.annotations[jjj], jjj), - Math.min(4.0, Math.max(-4.0, datum.lod)))); - } - stream.println("NA,NA,NA)"); - stream.println("s <- matrix(surface,ncol=3,byrow=T)"); - - 
stream.print("data <- c("); - for( final VariantDatum datum : randomData ) { - stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,", - dataManager.denormalizeDatum(datum.annotations[iii], iii), - dataManager.denormalizeDatum(datum.annotations[jjj], jjj), - (datum.lod < lodCutoff ? -1.0 : 1.0), - (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1))); - } - stream.println("NA,NA,NA,NA,1)"); - stream.println("d <- matrix(data,ncol=5,byrow=T)"); - - final String surfaceFrame = "sf." + annotationKeys[iii] + "." + annotationKeys[jjj]; - final String dataFrame = "df." + annotationKeys[iii] + "." + annotationKeys[jjj]; - - stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])"); - stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])"); - stream.println("dummyData <- " + dataFrame + "[1,]"); - stream.println("dummyData$x <- NaN"); - stream.println("dummyData$y <- NaN"); - stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")"); - stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))"); - 
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))"); - stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); - stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)"); - stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))"); - stream.println("arrange(p1, p2, p3, p4, ncol=2)"); - } - } - stream.println("dev.off()"); - - stream.println("if (exists(\"compactPDF\")) {"); - stream.println("compactPDF(outputPDF)"); - stream.println("}"); - - stream.close(); - - // Execute Rscript command to generate the clustering plots - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(RSCRIPT_FILE); - logger.info("Executing: " + executor.getApproximateCommandLine()); - executor.exec(); - } - - // The Arrange function is how we place the 4 model plots on one page - // from http://gettinggeneticsdone.blogspot.com/2010/03/arrange-multiple-ggplot2-plots-in-same.html - private void createArrangeFunction( final PrintStream stream ) { - stream.println("vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)"); - stream.println("arrange <- function(..., nrow=NULL, ncol=NULL, 
as.table=FALSE) {"); - stream.println("dots <- list(...)"); - stream.println("n <- length(dots)"); - stream.println("if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}"); - stream.println("if(is.null(nrow)) { nrow = ceiling(n/ncol)}"); - stream.println("if(is.null(ncol)) { ncol = ceiling(n/nrow)}"); - stream.println("grid.newpage()"); - stream.println("pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )"); - stream.println("ii.p <- 1"); - stream.println("for(ii.row in seq(1, nrow)){"); - stream.println("ii.table.row <- ii.row "); - stream.println("if(as.table) {ii.table.row <- nrow - ii.table.row + 1}"); - stream.println("for(ii.col in seq(1, ncol)){"); - stream.println("ii.table <- ii.p"); - stream.println("if(ii.p > n) break"); - stream.println("print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))"); - stream.println("ii.p <- ii.p + 1"); - stream.println("}"); - stream.println("}"); - stream.println("}"); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java deleted file mode 100644 index b501655f8..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java +++ /dev/null @@ -1,120 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Mar 4, 2011 - */ - -public class VariantRecalibratorArgumentCollection { - - public enum Mode { - SNP, - INDEL, - BOTH - } - - static Mode parseString(final String input) { - if( input.equals("SNP") ) { return Mode.SNP; } - if( input.equals("INDEL") ) { return Mode.INDEL; } - if( input.equals("BOTH") ) { return Mode.BOTH; } - throw new ReviewedStingException("VariantRecalibrator mode string is unrecognized, input = " + input); - } - - @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) 
BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false) - public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; - - @Advanced - @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false) - public int MAX_GAUSSIANS = 8; - - @Advanced - @Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm. The actual maximum used is the min of the mG and mNG arguments. Note that this number should be small (like 4) to achieve the best results", required=false) - public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2; - - @Advanced - @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. 
Procedure will normally end when convergence is detected.", required=false) - public int MAX_ITERATIONS = 150; - - @Advanced - @Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false) - public int NUM_KMEANS_ITERATIONS = 100; - - @Advanced - @Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false) - public double STD_THRESHOLD = 10.0; - - @Advanced - @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false) - public double SHRINKAGE = 1.0; - - @Advanced - @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false) - public double DIRICHLET_PARAMETER = 0.001; - - @Advanced - @Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false) - public double PRIOR_COUNTS = 20.0; - - @Advanced - @Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. 
Training sets large than this will be randomly downsampled.", required=false) - protected int MAX_NUM_TRAINING_DATA = 2500000; - - @Advanced - @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false) - public int MIN_NUM_BAD_VARIANTS = 1000; - - @Advanced - @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false) - public double BAD_LOD_CUTOFF = -5.0; -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java deleted file mode 100644 index a587b0250..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java +++ /dev/null @@ -1,228 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; - -/** - * Combines gVCF records that were produced by the Haplotype Caller from single sample sources. - * - *

- * CombineReferenceCalculationVariants combines gVCF records that were produced as part of the "single sample discovery" - * pipeline using the '-ERC GVCF' mode of the Haplotype Caller. This tools performs the multi-sample joint aggregation - * step and merges the records together in a sophisticated manner. - * - * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, - * re-genotype the newly merged record, and then re-annotate it. - * - * - *

Input

- *

- * One or more Haplotype Caller gVCFs to combine. - *

- * - *

Output

- *

- * A combined VCF. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CombineReferenceCalculationVariants \
- *   --variant input1.vcf \
- *   --variant input2.vcf \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-10,stop=10)) -public class CombineReferenceCalculationVariants extends RodWalker implements AnnotatorCompatible, TreeReducible { - - // TODO -- allow a file of VCF paths to be entered? - - /** - * The VCF files to merge together - */ - @Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true) - public List> variants; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false) - public boolean INCLUDE_NON_VARIANTS = false; - - /** - * Which annotations to recompute for the combined output VCF file. - */ - @Advanced - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false) - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"})); - - /** - * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. - * dbSNP is not used in any way for the calculations themselves. 
- */ - @ArgumentCollection - protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } - - // the genotyping engine - private UnifiedGenotyperEngine genotypingEngine; - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - - public List> getCompRodBindings() { return Collections.emptyList(); } - public RodBinding getSnpEffRodBinding() { return null; } - public List> getResourceRodBindings() { return Collections.emptyList(); } - public boolean alwaysAppendDbsnpId() { return false; } - - - public void initialize() { - // take care of the VCF headers - final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); - final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); - final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); - vcfWriter.writeHeader(vcfHeader); - - // create the genotyping engine - final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); - - // create the annotation engine - annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); - } - - public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return null; - - final GenomeLoc loc = 
ref.getLocus(); - final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null); - if ( combinedVC == null ) - return null; - - return regenotypeVC(tracker, ref, combinedVC); - } - - /** - * Re-genotype (and re-annotate) a combined genomic VC - * - * @param tracker the ref tracker - * @param ref the ref context - * @param combinedVC the combined genomic VC - * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites - */ - protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext combinedVC) { - if ( combinedVC == null ) throw new IllegalArgumentException("combinedVC cannot be null"); - - VariantContext result = combinedVC; - - // only re-genotype polymorphic sites - if ( combinedVC.isVariant() ) - result = genotypingEngine.calculateGenotypes(result); - - // if it turned monomorphic and we don't want such sites, quit - if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() ) - return null; - - // re-annotate it - return annotationEngine.annotateContext(tracker, ref, null, result); - } - - public VariantContextWriter reduceInit() { - return vcfWriter; - } - - public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) { - if ( vc != null ) - writer.add(vc); - return writer; - } - - @Override - public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) { - return lhs; - } - - @Override - public void onTraversalDone(final VariantContextWriter writer) {} -} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java deleted file mode 100644 index c9e4e44f0..000000000 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.*; - -public class PosteriorLikelihoodsUtils { - - public static VariantContext calculatePosteriorGLs(final VariantContext vc1, - final Collection resources, - final int numRefSamplesFromMissingResources, - final double globalFrequencyPriorDirichlet, - final boolean useInputSamples, - final boolean useEM, - final boolean useAC) { - if ( useEM ) - throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); - - final Map totalAlleleCounts = new HashMap<>(); - for ( final VariantContext resource : resources ) { - addAlleleCounts(totalAlleleCounts,resource,useAC); - } - - if ( useInputSamples ) { - addAlleleCounts(totalAlleleCounts,vc1,useAC); - } - - 
totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); - - // now extract the counts of the alleles present within vc1, and in order - final double[] alleleCounts = new double[vc1.getNAlleles()]; - int alleleIndex = 0; - for ( final Allele allele : vc1.getAlleles() ) { - - alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ? - totalAlleleCounts.get(allele) : 0 ); - } - - final List likelihoods = new ArrayList<>(vc1.getNSamples()); - for ( final Genotype genotype : vc1.getGenotypes() ) { - likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null ); - } - - final List posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2)); - - final GenotypesContext newContext = GenotypesContext.create(); - for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) { - final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); - if ( posteriors.get(genoIdx) != null ) { - GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, - GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); - builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, - Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); - - } - newContext.add(builder.make()); - } - - final List priors = Utils.listFromPrimitives( - GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs()); - - return new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG",priors).make(); - } - - /** - * Given genotype likelihoods and known allele counts, calculate the posterior likelihoods - * over the genotype states - * @param genotypeLikelihoods - the genotype likelihoods for the individual - * @param knownAlleleCountsByAllele - the known allele counts 
in the population. For AC=2 AN=12 site, this is {10,2} - * @param ploidy - the ploidy to assume - * @return - the posterior genotype likelihoods - */ - protected static List calculatePosteriorGLs(final List genotypeLikelihoods, - final double[] knownAlleleCountsByAllele, - final int ploidy) { - if ( ploidy != 2 ) { - throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2"); - } - - final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy); - final List posteriors = new ArrayList<>(genotypeLikelihoods.size()); - for ( final double[] likelihoods : genotypeLikelihoods ) { - double[] posteriorLikelihoods = null; - - if ( likelihoods != null ) { - if ( likelihoods.length != genotypePriorByAllele.length ) { - throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d", - knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length)); - } - - posteriorLikelihoods = new double[genotypePriorByAllele.length]; - for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) { - posteriorLikelihoods[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx]; - } - - posteriorLikelihoods = MathUtils.toLog10(MathUtils.normalizeFromLog10(posteriorLikelihoods)); - - } - - posteriors.add(posteriorLikelihoods); - } - - return posteriors; - } - - // convenience function for a single genotypelikelihoods array. Just wraps. - protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods, - final double[] knownAlleleCountsByAllele, - final int ploidy) { - return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0); - } - - - /** - * Given known allele counts (whether external, from the sample, or both), calculate the prior distribution - * over genotype states. 
This assumes - * 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet) - * 2) Genotype states are independent (Hardy-Weinberg) - * These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior - * (the "number of trials" for the multinomial is simply the ploidy) - * @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2} - * @param ploidy - the number of chromosomes in the sample. For now restricted to 2. - * @return - the Dirichlet-Multinomial distribution over genotype states - */ - protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) { - if ( ploidy != 2 ) { - throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2"); - } - - // multi-allelic format is - // AA AB BB AC BC CC AD BD CD DD ... - final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele); - final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2]; - int priorIndex = 0; - for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) { - for ( int allele1 = 0; allele1 <= allele2; allele1++) { - final int[] counts = new int[knownCountsByAllele.length]; - counts[allele1] += 1; - counts[allele2] += 1; - priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy); - } - } - - return priors; - } - - private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { - final int[] ac; - if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! 
useAC ) { - ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); - } else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); - } else { - ac = new int[context.getAlternateAlleles().size()]; - int idx = 0; - for ( final Allele allele : context.getAlternateAlleles() ) { - ac[idx++] = context.getCalledChrCount(allele); - } - } - - for ( final Allele allele : context.getAlleles() ) { - final int count; - if ( allele.isReference() ) { - if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac); - } else { - count = context.getCalledChrCount() - (int) MathUtils.sum(ac); - } - } else { - count = ac[context.getAlternateAlleles().indexOf(allele)]; - } - if ( ! counts.containsKey(allele) ) { - counts.put(allele,0); - } - counts.put(allele,count + counts.get(allele)); - } - } - - public static int[] extractInts(final Object integerListContainingVCField) { - List mleList = null; - if ( integerListContainingVCField instanceof List ) { - if ( ((List) integerListContainingVCField).get(0) instanceof String ) { - mleList = new ArrayList<>(((List) integerListContainingVCField).size()); - for ( Object s : ((List)integerListContainingVCField)) { - mleList.add(Integer.parseInt((String) s)); - } - } else { - mleList = (List) integerListContainingVCField; - } - } else if ( integerListContainingVCField instanceof Integer ) { - mleList = Arrays.asList((Integer) integerListContainingVCField); - } else if ( integerListContainingVCField instanceof String ) { - mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField)); - } - if ( mleList == null ) - throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ - VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); - - final int[] mle = new int[mleList.size()]; - - if ( ! 
( mleList.get(0) instanceof Integer ) ) { - throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName()); - } - - for ( int idx = 0; idx < mle.length; idx++) { - mle[idx] = mleList.get(idx); - } - - return mle; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java b/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java deleted file mode 100644 index e1de32bf6..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/collections/CountSet.java +++ /dev/null @@ -1,521 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.utils.collections; - -import com.google.java.contract.Requires; -import com.sun.istack.internal.NotNull; - -import java.lang.reflect.Array; -import java.util.*; - -/** - * Efficient implementation for a small set of integer primitive values. - *

- * It includes a increment operation incAll which is convenient when analyzing the read-threading graphs. Nevertheless - * it can be also be used in general purpose. - *

- *

- * It does not provide a O(1) look-up of its elements though. These are kept in a sorted array so look up is implemented - * using a binary search O(log n). Therefore it might not be optimal for problems that require large integer sets. - *

- *

- * Also note that addition can be costly for large sets unless done in order: O(n). - *

- * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class CountSet implements Cloneable, Set { - - /** - * The size of the set. - */ - private int size; - - /** - * Holds the element of the set within the subrange [0 .. size - 1] in ascending order. - */ - private int[] elements; - - /** - * Creates a copy of an existing int-set. - * @param template the intset to copy values from. - */ - public CountSet(final CountSet template) { - elements = template.elements.clone(); - size = template.size; - } - - /** - * Creates a new set indicating the expected maximum number of elements it will contain. - * @param initialCapacity the desired initial capacity of the set. - * @throws IllegalArgumentException if initialCapacity is negative. - */ - public CountSet(int initialCapacity) { - if (initialCapacity < 0) - throw new IllegalArgumentException(); - elements = new int[initialCapacity]; - size = 0; - } - - /** - * Set the set contents to a single integer value. - * @param value the integer value to set the set to. - */ - public void setTo(int value) { - ensureCapacity(1); - size = 1; - elements[0] = value; - } - - /** - * Set the content of this set to a collection of integers. - * @param values the new values to be included in the set. - * @throws NullPointerException if value is null. - */ - public void setTo(int ... values) { - ensureCapacity(values.length); - size = values.length; - System.arraycopy(values, 0, elements, 0, size); - Arrays.sort(elements,0,size); - } - - /** - * Increase (or decrease) all elements in the set by a number. - * @param delta the number of add (or substract if negative) to all elements. - * - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean incAll(final int delta) { - if (size == 0 || delta == 0) - return false; - for (int i = 0; i < size; i++) - elements[i] += delta; - return true; - } - - /** - * Returns the smallest integer value in the set. 
- * - * @throws NoSuchElementException if the set is empty (thus there is no minimum). - * @return the smallest integer value in the set. - */ - public int min() { - if (size == 0) - throw new NoSuchElementException("cannot have a min from an empty set"); - return elements[0]; - } - - /** - * Returns the largest integer value in the set. - * - * @throws NoSuchElementException if the set is empty (thus there is no maximum). - * @return the largest integer value in the set. - */ - public int max() { - if (size == 0) - throw new NoSuchElementException("cannot have a max from an empty set"); - return elements[size - 1]; - } - - /** - * Adds a range of integer values to the collection. - * - * This method avoid the need to explicity indicate all values in that range. Notice that the range is fully inclusive. - * You can indicate a decrease range (fromValue > toValue). - * - * @param fromValue the first value to add in the set (inclusive). - * @param toValue the last value to add to the set (inclusive). - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addRange(final int fromValue, final int toValue) { - final int lowEnd; - final int highEnd; - - if (fromValue <= toValue) { - lowEnd = fromValue; highEnd = toValue; - } else { - highEnd = fromValue; lowEnd = toValue; - } - - //TODO to be optimized to add missing sub-ranges in one go: - boolean result = false; - for (int i = lowEnd; i <= highEnd; i++) - result = add(i) | result; - return result; - } - - /** - * Add an integer value to the set. - * @param value to add to the set. - * @return true if the set changed as a result of this invocation, false otherwise. 
- */ - public boolean add(final int value) { - int pos = Arrays.binarySearch(elements,0,size,value); - if (pos >= 0) return false; - int insertPos = - pos - 1; - ensureCapacity(size + 1); - System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos); - elements[insertPos] = value; - size++; - return true; - } - - /** - * Add a arbitrary number of integers to the set. - * - * @param values integer to add to the set. - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addAll(final int ... values) { - ensureCapacity(size + values.length); - boolean result = false; - for (final int v : values) - result = add(v) | result; - return result; - } - - @Override - public boolean addAll(final Collection numbers) { - ensureCapacity(size + numbers.size()); - boolean result = false; - for (final Number n : numbers) - result = add(n.intValue()) | result; - return result; - } - - /** - * Add all values within a range in an integer array. - * - * @param source array where the values to add are found. - * @param fromIndex first position from source to add (inclusive). - * @param toIndex index after the last position in source to add (thus exclusive). - * @throws NullPointerException if source is null. - * @throws NegativeArraySizeException if fromIndex or toIndex are negative. - * @throws ArrayIndexOutOfBoundsException if fromIndex or toIndex are beyond bounds - * allowed [0 .. source.length]. - * @return true if the set changed as a result of this invocation, false otherwise. - */ - public boolean addAll(final int[] source, final int fromIndex, final int toIndex) { - ensureCapacity(size + source.length); - boolean result = false; - for (int i = fromIndex; i < toIndex; i++) - result = add(source[i]) | result; - return result; - } - - - /** - * Add all elements present in a int-set. - * - * @param other the other inset. - * - * @throws NullPointerException if other is null. 
- * @return true if this set changed due to this operation, false otherwise. - */ - public boolean addAll(final CountSet other) { - return addAll(other.elements,0,other.size); - } - - /** - * Checks whether a integer value is included in the set. - * @param value the value to check. - * @return true if value is inside the set, false otherwise. - */ - public boolean contains(final int value) { - return Arrays.binarySearch(elements,0,size,value) >= 0; - } - - /** - * Make sure that this int-set has capacity to handle a number of elements. - *

- * If the set has already that or greater capacity nothing would be changed. - * - * @param capacity the requested capacity. - */ - private void ensureCapacity(final int capacity) { - if (elements.length >= capacity) return; - int newLength = Math.max(elements.length << 1, capacity); - elements = Arrays.copyOf(elements,newLength); - } - - - @Override - public int size() { - return size; - } - - @Override - public boolean isEmpty() { - return size() == 0; - } - - @Override - public boolean contains(final Object o) { - if (o instanceof Integer) { - final int i = (Integer)o; - return contains(i); - } else - return false; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - @NotNull - public Iterator iterator() { - return new MyIterator(); - } - - @Override - @NotNull - public Object[] toArray() { - final Integer[] result = new Integer[size]; - for (int i = 0; i < size; i++) - result[i] = elements[i]; - return result; - } - - @Override - @NotNull - @SuppressWarnings("unchecked") - public T[] toArray(final T[] a) { - if (a == null) - throw new NullPointerException(); - - @SuppressWarnings("unchecked") - final Class componentClass = (Class) a.getClass().getComponentType(); - if (!componentClass.isAssignableFrom(Integer.class)) - throw new ArrayStoreException(); - - @SuppressWarnings("unchecked") - final T[] dest = (a.length < size) ? (T[]) (Object[]) Array.newInstance(componentClass, size) : a; - - for (int i = 0; i < size; i++) - dest[i] = (T) (Integer) elements[i]; - return dest; - } - - /** - * Copies the content of the set into an integer array. The result can be freely modified by the invoker. - * @return never null but a zero-length array if the set is empty. - */ - @NotNull - public int[] toIntArray() { - return Arrays.copyOfRange(elements,0,size); - } - - /** - * Copy the content of the set into an array. - * @param dest the destination array. - * @param offset where to store the first element of the set. 
- * @throws NullPointerException if dest is null. - * @throws ArrayIndexOutOfBoundsException if offset is out of range of there is not enough - * space after offset in the destination array to hold all values in the set. - */ - public void copyTo(final int[] dest, int offset) { - if (dest == null) - throw new NullPointerException(); - if (dest.length < (size + offset)) - throw new ArrayIndexOutOfBoundsException("destination is to short"); - System.arraycopy(elements,0,dest,offset,size); - } - - /** - * Copy the content of the set into an array. - * @param dest the destination array. - * @throws NullPointerException if dest is null. - * @throws ArrayIndexOutOfBoundsException if there is not enough - * space after offset in the destination array to hold all values in the set. - */ - public void copyTo(final int[] dest) { - copyTo(dest,0); - } - - - @Override - public boolean add(final Integer integer) { - return add((int) integer); - } - - @Override - public boolean remove(final Object o) { - return o instanceof Integer && remove((int)o); - } - - /** - * Removes a single integer value for the set. - * @param i the value to remove. - * @return true if the set has changed as a result of this invocation, false otherwise. 
- */ - public boolean remove(final int i) { - final int pos = Arrays.binarySearch(elements,0,size,i); - if (pos < 0) - return false; - else { - removeIndex(pos); - return true; - } - } - - @Override - public boolean containsAll(final Collection c) { - for (final Object o : c) - if (!contains(o)) - return false; - return true; - } - - - @Override - public boolean retainAll(final Collection c) { - if (size == 0) - return false; - @SuppressWarnings("all") - final CountSet retainIndices = new CountSet(c.size() + 2); - retainIndices.add(-1); - retainIndices.add(size); - for (final Object o : c) { - if (!(o instanceof Integer)) - continue; - final int pos = Arrays.binarySearch(elements,0,size,(int) o); - if (pos < 0) - continue; - retainIndices.add(pos); - } - if (retainIndices.size == 2) { - size = 0; - return true; - } else if (retainIndices.size == size + 2) { - return false; - } else { - for (int idx = retainIndices.size - 1; idx > 0; idx--) { - final int toIdx = retainIndices.elements[idx]; - final int fromIdx = retainIndices.elements[idx - 1] + 1; - removeIndices(toIdx,fromIdx); - } - return true; - } - } - - /** - * Removes the values found in a range of indexes in {@link #elements}. - * @param fromIdx first index to remove (inclusive). - * @param toIdx right after last index to remove (exclusive). 
- */ - @Requires("fromIdx >= toIdx & fromIdx >= 0 & toIdx <= size") - private void removeIndices(final int fromIdx, final int toIdx) { - System.arraycopy(elements,toIdx,elements,fromIdx,size - toIdx); - size -= toIdx - fromIdx; - } - - @Override - public boolean removeAll(final Collection c) { - boolean result = false; - for (final Object o : c) - result = remove(o) | result; - return result; - } - - @Requires("idx >= 0 && idx < size") - private void removeIndex(int idx) { - System.arraycopy(elements,idx+1,elements,idx,size - idx - 1); - } - - @Override - public void clear() { - size = 0; - } - - /** - * Returns a copy of this set which can be changed without modifying the original one. - * @return never {@code null}. - */ - @NotNull - @SuppressWarnings("all") - public CountSet clone() { - return new CountSet(this); - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(2 + size() * 10); - sb.append('{'); - for (int i = 0; i < size; i++) - sb.append(elements[i]).append(','); - sb.replace(sb.length()-1,sb.length(),"}"); - return sb.toString(); - - } - - - /** - * Custom iterator class for {@link CountSet IntSets} - */ - private class MyIterator implements Iterator { - /** What position I am in. 
*/ - private int next = 0; - - @Override - public boolean hasNext() { - return next < size; - } - - @Override - public Integer next() { - if (next >= size) - throw new NoSuchElementException(); - return elements[next]; - } - - @Override - public void remove() { - if (next == 0) - throw new IllegalStateException(); - if (next >= size) - throw new NoSuchElementException(); - removeIndex(next - 1); - } - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java deleted file mode 100644 index 98aedf786..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ /dev/null @@ -1,302 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; - -/** - * Genome-wide VCF writer - * - * User: depristo - * Date: 6/24/13 - * Time: 2:51 PM - */ -public class GVCFWriter implements VariantContextWriter { - // - // static VCF field names - // - protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; - protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; - protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; - - // - // Final fields initialized in constructor - // - /** Where we'll ultimately write our VCF records */ - final private VariantContextWriter underlyingWriter; - - final private List GQPartitions; - - /** fields updated on the fly during GVCFWriter operation */ - int nextAvailableStart = -1; - private String sampleName = null; - private HomRefBlock currentBlock = null; - - /** - * Is the proposed GQ partitions well-formed? 
- * - * @param GQPartitions proposed GQ partitions - * @return a non-null string if something is wrong (string explains issue) - */ - protected static List parsePartitions(final List GQPartitions) { - if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); - if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); - - final List result = new LinkedList<>(); - int lastThreshold = 0; - for ( final Integer value : GQPartitions ) { - if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); - if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); - if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); - result.add(new HomRefBlock(lastThreshold, value)); - lastThreshold = value; - } - result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE)); - - return result; - } - - /** - * Create a new GVCF writer - * - * Should be a non-empty list of boundaries. 
For example, suppose this variable is - * - * [A, B, C] - * - * We would partition our hom-ref sites into the following bands: - * - * X < A - * A <= X < B - * B <= X < C - * X >= C - * - * @param underlyingWriter the ultimate destination of the GVCF records - * @param GQPartitions a well-formed list of GQ partitions - */ - public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { - if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); - this.underlyingWriter = underlyingWriter; - this.GQPartitions = parsePartitions(GQPartitions); - } - - /** - * Write the VCF header - * - * Adds standard GVCF fields to the header - * - * @param header a non-null header - */ - @Override - public void writeHeader(VCFHeader header) { - if ( header == null ) throw new IllegalArgumentException("header cannot be null"); - header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); - header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); - header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); - - for ( final HomRefBlock partition : GQPartitions ) { - header.addMetaDataLine(partition.toVCFHeaderLine()); - } - - underlyingWriter.writeHeader(header); - } - - /** - * Close this GVCF writer. 
Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well - */ - @Override - public void close() { - close(true); - } - - /** - * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK - * - * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open - * so the GATK closes it later - * - * @param closeUnderlyingWriter should we leave the underlying writer open or closed? - */ - public void close(final boolean closeUnderlyingWriter) { - emitCurrentBlock(); - if ( closeUnderlyingWriter ) underlyingWriter.close(); - } - - /** - * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate - * - * @param vc a non-null VariantContext - * @param g a non-null genotype from VariantContext - * @return a VariantContext to be emitted, or null if non is appropriate - */ - protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { - if ( nextAvailableStart != -1 && vc.getStart() <= nextAvailableStart ) { - // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) - return null; - } else if ( currentBlock == null ) { - currentBlock = createNewBlock(vc, g); - return null; - } else if ( currentBlock.withinBounds(g.getGQ()) ) { - currentBlock.add(vc.getStart(), g); - return null; - } else { - final VariantContext result = blockToVCF(currentBlock); - currentBlock = createNewBlock(vc, g); - return result; - } - } - - /** - * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null - */ - private void emitCurrentBlock() { - if ( currentBlock != null ) { - // there's actually some work to do - underlyingWriter.add(blockToVCF(currentBlock)); - currentBlock = null; - } - } - - /** - * Convert a HomRefBlock into a VariantContext - * - * @param block the block to convert - * @return a VariantContext representing the gVCF encoding 
for this block - */ - private VariantContext blockToVCF(final HomRefBlock block) { - if ( block == null ) throw new IllegalArgumentException("block cannot be null"); - - final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); - vcb.attributes(new HashMap(2)); // clear the attributes - vcb.stop(block.getStop()); - vcb.attribute(VCFConstants.END_KEY, block.getStop()); - vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); - - // create the single Genotype with GQ and DP annotations - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); - gb.noAD().noPL().noAttributes(); // clear all attributes - gb.GQ(block.getMedianGQ()); - gb.DP(block.getMedianDP()); - gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); - gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); - gb.PL(block.getMinPLs()); - - return vcb.genotypes(gb.make()).make(); - } - - /** - * Helper function to create a new HomRefBlock from a variant context and current genotype - * - * @param vc the VariantContext at the site where want to start the band - * @param g the genotype of the sample from vc that should be used to initialize the block - * @return a newly allocated and initialized block containing g already - */ - private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { - // figure out the GQ limits to use based on the GQ of g - HomRefBlock partition = null; - for ( final HomRefBlock maybePartition : GQPartitions ) { - if ( maybePartition.withinBounds(g.getGQ()) ) { - partition = maybePartition; - break; - } - } - if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); - - // create the block, add g to it, and return it for use - final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); - block.add(vc.getStart(), g); - return block; - } - - /** - * Add a VariantContext to this writer 
for emission - * - * Requires that the VC have exactly one genotype - * - * @param vc a non-null VariantContext - */ - @Override - public void add(VariantContext vc) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - - if ( sampleName == null ) - sampleName = vc.getGenotype(0).getSampleName(); - - if ( ! vc.hasGenotypes() ) { - throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes"); - } else if ( vc.getGenotypes().size() != 1 ) { - throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size()); - } else { - if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) { - // we've made a non-contiguous step (across interval, onto another chr), so finalize - emitCurrentBlock(); - } - - final Genotype g = vc.getGenotype(0); - if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) { - // create bands - final VariantContext maybeCompletedBand = addHomRefSite(vc, g); - if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); - } else { - // g is variant, so flush the bands and emit vc - emitCurrentBlock(); - nextAvailableStart = vc.getEnd(); - underlyingWriter.add(vc); - } - } - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java deleted file mode 100644 index ebd167a31..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ /dev/null @@ -1,186 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFHeaderLine; - -import java.util.ArrayList; -import java.util.List; - -/** - * Helper class for calculating a GQ band in the GVCF writer - * - * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes, - * and provides summary information about the entire block of genotypes. 
- * - * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores - * - * User: depristo - * Date: 6/25/13 - * Time: 9:41 AM - */ -final class HomRefBlock { - private final VariantContext startingVC; - private int stop; - private final int minGQ, maxGQ; - private int[] minPLs = null; - final private List GQs = new ArrayList<>(100); - final private List DPs = new ArrayList<>(100); - private final Allele ref; - - /** - * Create a new HomRefBlock - * - * @param startingVC the VariantContext that starts this band (for starting position information) - * @param minGQ the minGQ (inclusive) to use in this band - * @param maxGQ the maxGQ (exclusive) to use in this band - */ - public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { - if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); - if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); - - this.startingVC = startingVC; - this.stop = getStart() - 1; - this.ref = startingVC.getReference(); - this.minGQ = minGQ; - this.maxGQ = maxGQ; - } - - /** - * Create a new HomRefBlock only for doing bounds checking - * - * @param minGQ the minGQ (inclusive) to use in this band - * @param maxGQ the maxGQ (exclusive) to use in this band - */ - public HomRefBlock(int minGQ, int maxGQ) { - if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); - - this.startingVC = null; - this.stop = -1; - this.ref = null; - this.minGQ = minGQ; - this.maxGQ = maxGQ; - } - - /** - * Add information from this Genotype to this band - * @param g a non-null Genotype with GQ and DP attributes - */ - public void add(final int pos, final Genotype g) { - if ( g == null ) throw new IllegalArgumentException("g cannot be null"); - if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); - if ( ! 
g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); - if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); - if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); - - if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs - final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs = PL.clone(); - } - } else { // otherwise take the min with the provided genotype's PLs - final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs[0] = Math.min(minPLs[0], PL[0]); - minPLs[1] = Math.min(minPLs[1], PL[1]); - minPLs[2] = Math.min(minPLs[2], PL[2]); - } - } - stop = pos; - GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission - DPs.add(g.getDP()); - } - - /** - * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ) - * @param GQ the GQ value to test - * @return true if within bounds, false otherwise - */ - public boolean withinBounds(final int GQ) { - return GQ >= minGQ && GQ < maxGQ; - } - - /** Get the min GQ observed within this band */ - public int getMinGQ() { return MathUtils.arrayMin(GQs); } - /** Get the median GQ observed within this band */ - public int getMedianGQ() { return MathUtils.median(GQs); } - /** Get the min DP observed within this band */ - public int getMinDP() { return MathUtils.arrayMin(DPs); } - /** Get the median DP observed within this band */ - public int getMedianDP() { return MathUtils.median(DPs); } - /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ - public int[] getMinPLs() { return minPLs; } - - protected int getGQUpperBound() { return maxGQ; } - protected int getGQLowerBound() { return minGQ; } - - public boolean isContiguous(final VariantContext vc) { - return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr()); - } - - public 
VariantContext getStartingVC() { return startingVC; } - public int getStart() { return startingVC.getStart(); } - public int getStop() { return stop; } - public Allele getRef() { return ref; } - public int getSize() { return getStop() - getStart() + 1; } - - @Override - public String toString() { - return "HomRefBlock{" + - "minGQ=" + minGQ + - ", maxGQ=" + maxGQ + - '}'; - } - - public VCFHeaderLine toVCFHeaderLine() { - return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java deleted file mode 100644 index a693ec22d..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java +++ /dev/null @@ -1,450 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.Arrays; - -/** - * Created with IntelliJ IDEA. - * User: bradt - * Date: 6/11/13 - */ -public class ArrayLoglessPairHMM extends PairHMM { - private static final double INITIAL_CONDITION = Math.pow(2, 1020); - private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected static final double TRISTATE_CORRECTION = 3.0; - - private static final int matchToMatch = 0; - private static final int indelToMatch = 1; - private static final int matchToInsertion = 2; - private static final int insertionToInsertion = 3; - private static final int matchToDeletion = 4; - private static final int deletionToDeletion = 5; - - protected double[][] transition = null; // The transition probabilities cache - protected double[][] prior = null; // The prior probabilities cache - - // Array declarations for arrays implementation - private double[] currentMatchArray = null; - private double[] currentDeleteArray = null; - private double[] currentInsertArray = null; - private double[] parentMatchArray = null; - private double[] parentDeleteArray = null; - private double[] parentInsertArray = null; - private double[] grandparentMatchArray = null; - private double[] grandparentDeleteArray = null; - private double[] grandparentInsertArray = null; - - // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading - private double[] matchCacheArray = null; - private double[] deleteCacheArray = null; - private double[] insertCacheArray = null; - - // These arrays store cache info for use with the next haplotype; for writing - private double[] nextMatchCacheArray = null; - private double[] 
nextDeleteCacheArray = null; - private double[] nextInsertCacheArray = null; - - // Used when caching to store our intermediate sum at point of first difference bw successive haplotypes - private double partialSum; - - - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - - // Initialize all arrays - // Final Cell of array is a padding cell, initialized to zero. - currentMatchArray = new double[paddedMaxReadLength]; - currentDeleteArray = new double[paddedMaxReadLength]; - currentInsertArray = new double[paddedMaxReadLength]; - - parentMatchArray = new double[paddedMaxReadLength]; - parentDeleteArray = new double[paddedMaxReadLength]; - parentInsertArray = new double[paddedMaxReadLength]; - - grandparentMatchArray = new double[paddedMaxReadLength]; - grandparentDeleteArray = new double[paddedMaxReadLength]; - grandparentInsertArray = new double[paddedMaxReadLength]; - - // Initialize the special arrays used for caching when successive haplotypes have a common prefix - matchCacheArray = new double[paddedMaxReadLength]; - deleteCacheArray = new double[paddedMaxReadLength]; - insertCacheArray = new double[paddedMaxReadLength]; - - nextMatchCacheArray = new double[paddedMaxReadLength]; - nextDeleteCacheArray = new double[paddedMaxReadLength]; - nextInsertCacheArray = new double [paddedMaxReadLength]; - - } - - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex) { - - if ( ! 
constantsAreInitialized) { - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - - // note that we initialized the constants - constantsAreInitialized = true; - } - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - - // Some housekeeping to be done if we are starting a new read - if (recacheReadValues) { - hapStartIndex = 0; - - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - // note that we initialized the constants - constantsAreInitialized = true; - - // Read length may have changed, so we need to set zero-value padding at the appropriate position. - padMatchAndInsertArrays(readBases.length); - } - - // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration - if (hapStartIndex == 0) { - clearPreviouslyCachedInfo(readBases.length); - - // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. - padDeleteArrays(haplotypeBases.length, readBases.length); - } - - // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. 
- clearArraySolutionPosition(); - - // Some parameters to control behavior during the dynamic programming loop - final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; - int startFill; // The lower bound of the array indices we want to over-write - int endFill; // The upper bound of the array indices we want to over-write - final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype - double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction - - // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix - for (int i = 1; i <= maxDiagonals; i++) { - // set the bounds for cells we wish to fill in the arrays - startFill = Math.max(readBases.length - i, 0); - endFill = Math.min(maxDiagonals - i + 1, readBases.length); - - // apply any previously cached array information - if (i <= readBases.length) - applyPreviouslyCachedInfo(startFill); - - // fill in the cells for our current arrays - updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! (huge) - // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. - // Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above. - finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; - - // Partial sum for caching the next haplotype: - // At the position of the last similar base between this haplotype and the next one... - // ...remember the partial sum, so that we can start here on the next hap. 
- if (i == cacheSumIndex) - partialSum = finalArraySumProbabilities; - - rotateArrayReferences(); - } - // The cache arrays we wrote for this haplotype will be read for the next haplotype. - rotateCacheArrays(); - - //return result - return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; - } - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. - * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. - - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. 
- * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); - } - } - - /** - * Pad the ends of the Match and Insert arrays with 0. - * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. - * - * @param padPosition Which index in the arrays we wish to pad - */ - private void padMatchAndInsertArrays(final int padPosition) { - grandparentMatchArray[padPosition] = 0; - grandparentInsertArray[padPosition] = 0; - parentMatchArray[padPosition] = 0; - parentInsertArray[padPosition] = 0; - currentMatchArray[padPosition] = 0; - currentInsertArray[padPosition] = 0; - matchCacheArray[padPosition] = 0; - insertCacheArray[padPosition] = 0; - nextMatchCacheArray[padPosition] = 0; - nextInsertCacheArray[padPosition] = 0; - } - - /** - * Pad the Delete arrays with an intial value. Let's us have free deletions at the beginning of the alignment. 
- * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. - * - * @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value - * @param padPosition Which index in the arrays we wish to pad - */ - private void padDeleteArrays(final int haplotypeLength, final int padPosition) { - final double initialValue = INITIAL_CONDITION / haplotypeLength; - - // Pad the deletion arrays. Akin to padding the first row in the deletion matrix - parentDeleteArray[padPosition] = initialValue; - grandparentDeleteArray[padPosition] = initialValue; - currentDeleteArray[padPosition] = initialValue; - deleteCacheArray[padPosition] = initialValue; - nextDeleteCacheArray[padPosition] = initialValue; - } - - /** - * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. - * - */ - private void clearArraySolutionPosition() { - grandparentMatchArray[0] = 0; - grandparentInsertArray[0] = 0; - parentMatchArray[0] = 0; - parentInsertArray[0] = 0; - currentMatchArray[0] = 0; - currentInsertArray[0] = 0; - } - - /** - * Clears cached information saved from the last haplotype, - * allowing us to start at the beginning of the present haplotype with intitial values of 0. - * - * @param fillLength How much of the cache arrays do we need to zero - */ - private void clearPreviouslyCachedInfo(final int fillLength) { - Arrays.fill(matchCacheArray, 0, fillLength, 0); - Arrays.fill(deleteCacheArray, 0, fillLength, 0); - Arrays.fill(insertCacheArray, 0, fillLength, 0); - - partialSum = 0; - } - - /** - * Applies cached information saved from the last haplotype, - * allowing us to start in the middle of the present haplotype. 
- * - * @param indK the index in the arrays we wish to update with cached info - */ - private void applyPreviouslyCachedInfo(int indK) { - // apply caching info necessary for calculating current DELETE array values - parentMatchArray[indK] = matchCacheArray[indK]; - parentDeleteArray[indK] = deleteCacheArray[indK]; - - // apply caching info necessary for calculating current MATCH array values - grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; - grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; - grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; - } - - /** - * Records the mid-process state of one location in the read/haplotype alignment. - * Writes new cache information for use with the next haplotype we see. - * - * @param indK the index in the cache arrays we wish to store information in - */ - private void recordNewCacheInfo(int indK) { - nextMatchCacheArray[indK] = currentMatchArray[indK]; - nextDeleteCacheArray[indK] = currentDeleteArray[indK]; - nextInsertCacheArray[indK] = currentInsertArray[indK]; - } - - /** - * Update the HMM arrays for the current diagonal. 
- * - * @param readLength The length of the read - * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype - * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info - * @param startFill The lower bound of the array indices we want to over-write - * @param endFill The upper bound of the array indices we want to over-write - * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on - */ - private void updateArrays(final int readLength, - final int hapStartIndex, - final int nextHapStartIndex, - final int startFill, - final int endFill, - final int iii) { - - // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment - int matrixRow; - int matrixCol; - - int arrayIndex; - for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { - // translate the array position into a row, column in the priors and transition matrices - matrixRow = readLength - arrayIndex - 1; - matrixCol = iii - matrixRow - 1 + hapStartIndex; - - // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col - updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); - - // Set up caching for the next haplotype - // At the position of the final similar base between this haplotype and the next one, remember the mid-array values - if (matrixCol == nextHapStartIndex - 1) - recordNewCacheInfo(arrayIndex); - } - } - - /** - * Updates a cell in the HMM arrays - * - * @param indK index in the arrays to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - private void updateArrayCell( final int indK, final double prior, final double[] transition) { - currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + - grandparentInsertArray[indK + 1] * transition[indelToMatch] + - grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); - currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; - currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; - } - - /** - * To prepare for the next diagonal in our loop, each array must be bumped to an older generation - * - */ - private void rotateArrayReferences() { - double[] tempMatchArray = grandparentMatchArray; - double[] tempDeleteArray = grandparentDeleteArray; - double[] tempInsertArray = grandparentInsertArray; - - grandparentMatchArray = parentMatchArray; - grandparentDeleteArray = parentDeleteArray; - grandparentInsertArray = parentInsertArray; - - parentMatchArray = currentMatchArray; - parentDeleteArray = currentDeleteArray; - parentInsertArray = currentInsertArray; - - currentMatchArray = tempMatchArray; - currentDeleteArray = tempDeleteArray; - currentInsertArray = tempInsertArray; - } - - /** - * To prepare for 
the next haplotype, the caching info we wrote is copied into the cach-read arrays - * - */ - private void rotateCacheArrays() { - matchCacheArray = nextMatchCacheArray.clone(); - deleteCacheArray = nextDeleteCacheArray.clone(); - insertCacheArray = nextInsertCacheArray.clone(); - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java deleted file mode 100644 index fb9dda8b2..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java +++ /dev/null @@ -1,820 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -/** - * Fast partial PairHMM backed on the standard Logless PairHMM - * - */ -public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { - - - /** - * Initial read length capacity. - */ - private static final int INITIAL_READ_LENGTH_CAPACITY = 200; - - /** - * Initial haplotype length capacity. - */ - private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400; - - - /** - * Holds the current read capacity. - *

It can only go up overtime.

- */ - private int readCapacity = INITIAL_READ_LENGTH_CAPACITY; - - /** - * Holds the current haplotype length capacity. - *

It can only go up overtime.

- */ - private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY; - - private int maxToCol; - private int haplotypeLength; - - /** - * Returns the currently loaded read base qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - public byte[] getReadQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readQuals; - } - - /** - * Returns the currently loaded read insertion qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadInsQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readInsQuals; - } - - /** - * Returns the currently loaded read deletion qualities. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadDelQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readDelQuals; - } - - /** - * Returns the currently loaded read gap extension penalty.. - * - * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. - * @return never {@code null}. - */ - @SuppressWarnings("unused") - public byte[] getReadGepQuals() { - if (readQuals == null) - throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); - return readGepQuals; - } - - - /** - * Creates a new pair-hmm calculator instance give the gap continuation penalty. - * - * @param gcp the gap-continuation penalty. 
- */ - public FastLoglessPairHMM(final byte gcp) { - constantGCP = gcp; - initialize(readCapacity,haplotypeCapacity); - } - - @Override - public byte getGapExtensionPenalty() { - return constantGCP; - } - - - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, final int nextHapStartIndex) { - this.readBases = readBases; - this.haplotypeBases = haplotypeBases; - this.haplotypeLength = haplotypeBases.length; - return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals, - insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex); - } - - /** - * Implement the last step summation to calculate the total likelihood. - * - * @param row number of the last row of the pair-hmm where the likelihood values are present. - * @param fromCol inclusive first column to include in the summation. - * @param toCol exclusive last column to include in the summation. - * @return 0 or less. - */ - protected double finalLikelihoodCalculation(final int row, - final int fromCol, final int toCol) { - - final double divider = Math.max(1,2 *(toCol - fromCol)); - final double dividerInverse = 1.0 / divider; - double finalLikelihood = 0; - - for (int j = fromCol; j < toCol; j++) { - finalLikelihood += matchMatrix[row][j] * dividerInverse; - finalLikelihood += insertionMatrix[row][j] * dividerInverse; - } - return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - - /** - * Initialize the matrix values for a problem including the trailing end of the read. - * - *

- * Notice that you can improve performance by omitting filling reusable values from - * previous haplotype calculations. You can set {@code haplotypeStartOffset} to skill - * those columns. - *

- * - * @param readStart inclusive first position of the read used in the calculations. - * @param readEnd exclusive last position of the read considered in the calculations. - * @param haplotypeStartOffset offset of the haplotype right after the reusable prefix - * from previous calls. - * - * - */ - protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd, - final int haplotypeStartOffset) { - - @SuppressWarnings("all") - final int zeroRow = readStart; - final int toRow = readEnd + 1; - final int toCol = haplotypeLength + 1; - - // fill first row with -Inf fot M and I but not for Deletion if leading - // to allow for free deletions at the beginning. - if (readStart == 0) { - // First row initialization: - Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0); - Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION); - - if (haplotypeStartOffset == 0) - for (int i = zeroRow + 1; i < toRow; i++) - insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0; - - } else { - Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0); - Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0); - if (haplotypeStartOffset == 0) { - matchMatrix[zeroRow][0] = INITIAL_CONDITION; - deletionMatrix[zeroRow][0] = 0; - } - if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion]; - for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) { - deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] - * transition[zeroRow][deletionToDeletion]; - } - - if (haplotypeStartOffset == 0) { - matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0; - insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion]; - - - for (int i = zeroRow + 2; i < toRow; i++) { - matchMatrix[i][0] = deletionMatrix[i][0] = 0; - insertionMatrix[i][0] = insertionMatrix[i - 
1][0] - * transition[i][insertionToInsertion]; - } - } - } - } - - /** - * Initializes calculation matrices give the characteristics of the next and previous problems. - * @param currentProblem reference to the Lk calculation problem we are dealing currently. - * @param previousProblem reference to the Lk calculation problem that has been solved just before. - * - */ - protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) { - if (previousProblem != null && - previousProblem.readStart == currentProblem.readStart && - previousProblem.hapStart == currentProblem.hapStart && - maxToCol >= currentProblem.hapEnd + 1) - return; - - final int zeroRow = currentProblem.readStart; - final int zeroCol = currentProblem.hapStart; - final int toRow = currentProblem.readEnd + 1; - final int toCol = currentProblem.hapEnd + 1; - maxToCol = toCol; - - // fill first row with -Inf fot M and I but not for Deletion if leading - // to allow for free deletions at the beginning. - if (currentProblem.leading) { - // First row initialization: - Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0); - Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION); - - for (int i = zeroRow + 1; i < toRow; i++) - insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; - - } else { // If not leading set the appropriate matching 1.0 prob and - // deletion + extension. 
- - Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0); - Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0); - matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION; - deletionMatrix[zeroRow][zeroCol] = 0; - deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion]; - for (int i = zeroCol + 2; i < toCol; i++) { - deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] - * transition[zeroRow][deletionToDeletion]; - } - - matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0; - insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion]; - - for (int i = zeroRow + 2; i < toRow; i++) { - matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; - insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol] - * transition[i][insertionToInsertion]; - } - } - } - - /** - * Constant gap-continuation-penalty. - */ - private final byte constantGCP; - - /** - * Currently loaded haplotype base sequence. - */ - private byte[] haplotypeBases; - - /** - * Currently loaded read base sequence. - */ - private byte[] readBases; - - /** - * Read qualities. - */ - private byte[] readQuals; - - /** - * Read insertion qualities. - */ - private byte[] readInsQuals; - - /** - * Read deletion qualities. - */ - private byte[] readDelQuals; - - /** - * Read gap-extension-penalties. - */ - private byte[] readGepQuals; - - /** - * Cached results. - */ - private Map cachedResults = new HashMap<>(); - - /** - * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. - * - * @param read the target read. - * @throws NullPointerException if {@code read} is null. 
- */ - @Override - public void loadRead(final GATKSAMRecord read) { - loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality()); - } - - /** - * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. - * - * @param readBases the read bases. - * @param readQuals the read base call quality scores. - * @param readInsQuals the read insertion quality scores. - * @param readDelQuals the read deletion quality scores. - * @param mq the read mapping quality score. - * @throws NullPointerException if any of the arrays passed is {@code null}. - * @throws IllegalArgumentException if the arrays passed have incompatible sizes. - */ - public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { - // TODO This is a copy&paste from PairHMM*Engine read data preparation code. - // TODO It is simply to difficult to share the code without changing that class and I don't want - // TODO to do so for now. 
- if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); - if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); - if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); - maxToCol = 0; - - if (readBases.length > readCapacity) { - readCapacity = readBases.length; - initialize(readCapacity,haplotypeCapacity); - } - paddedReadLength = readBases.length + 1; - final byte[] overallGCP = new byte[readBases.length]; - Arrays.fill(overallGCP, constantGCP); // Is there a way to derive - - for (int kkk = 0; kkk < readQuals.length; kkk++) { - readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], - mq); // cap base quality by mapping - readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? 
QualityUtils.MIN_USABLE_Q_SCORE - : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); - readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); - readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); - } - this.readBases = readBases; - this.readQuals = readQuals; - this.readInsQuals = readInsQuals; - this.readDelQuals = readDelQuals; - this.readGepQuals = overallGCP; - initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); - if (haplotypeBases != null) - fillPriorsTable(0); - cachedResults.clear(); - } - - @Override - public void loadHaplotypeBases(final byte[] haplotypeBases) { - if (readBases == null) - throw new IllegalStateException( - "no read was loaded before the haplotype"); - this.haplotypeBases = haplotypeBases.clone(); - haplotypeLength = haplotypeBases.length; - paddedHaplotypeLength = haplotypeLength; - if (haplotypeCapacity < haplotypeLength) { - haplotypeCapacity = haplotypeLength; - initialize(readCapacity,haplotypeCapacity); - initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); - } - initializePriors(this.haplotypeBases, readBases, readQuals, 0); - } - - - /** - * Changes only the suffix of the currently loaded haplotype. - * - *

- * If from is 0, this is equivalent to call {@link #loadHaplotypeBases(byte[])} directly. - *

- * @param from first position on the current haplotype to substitute with the new suffix. - * It can be up to the length of the haplotype in such case this operation is in - * effect just extending that haplotype. - * @param suffix the new bases for the end part of the current haplotype. - * @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array. - * @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array. - * - * @throws IllegalStateException if no read was loaded with {@link #loadRead}. - * @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent. - * @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges. - */ - public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) { - if (readBases == null) - throw new IllegalStateException( - "no read was loaded before the haplotype"); - if (haplotypeBases == null && from > 0) - throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded"); - if (suffixFrom < 0) - throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative"); - if (suffixTo > suffix.length) - throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length"); - if (suffixFrom > suffixTo) - throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index"); - if (from > haplotypeLength) - throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length"); - if (from < 0) - throw new IllegalArgumentException("the from index cannot be negative"); - - int startIndex = from; - if (haplotypeBases == null) { - haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo); - haplotypeLength = suffixTo - suffixFrom; - } else { - final int 
newLength = from + suffixTo - suffixFrom; - if (haplotypeBases.length < newLength) - haplotypeBases = Arrays.copyOf(haplotypeBases,newLength); - System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from); - haplotypeLength = newLength; - } - paddedHaplotypeLength = haplotypeLength + 1; - if (haplotypeCapacity < haplotypeLength) { - haplotypeCapacity = haplotypeLength; - initialize(readCapacity,haplotypeCapacity); - initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); - startIndex = 0; - } - //startIndex = 0; - fillPriorsTable(startIndex); - } - - /** - * Returns the bases of the current haplotype. - * - * @throws IllegalStateException if no haplotype was loaded previously - * @return never {@code null} - */ - public byte[] getHaplotypeBases() { - if (haplotypeBases == null) - throw new IllegalStateException(); - return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength); - } - - /** - * Returns a debug representation of the pair-hmm. - * @return never {@code null}. - */ - public String toString() { - return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength)); - } - - @Override - protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) { - haplotypeBases = hapBases; - haplotypeLength = haplotypeBases.length; - this.readBases = readBases; - this.readQuals = baseQuals; - fillPriorsTable(idx); - } - - /** - * Fills the prior table up. - * - *

- * It accepts an argument to save unnecessary prefix filling up. - *

- * - * @param idx first position in the haplotype to start filling from. - */ - protected void fillPriorsTable(final int idx) { - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = idx; j < haplotypeLength; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - - /** - * Decorates haplotype set with their likelihoods as compared with the currently loaded read. - * - * - * @param readStart inclusive start position of the targeted section of the read. - * @param readEnd exclusive end position just beyond the targeted section of the read. - * @param haplotypes in/out set of haplotypes. - */ - public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) { - final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator(); - boolean isFirst = true; - while (entryIterator.hasNext()) { - entryIterator.next(); - final int startIndex = entryIterator.startIndex(); - final byte[] bases = entryIterator.bases(); - changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length); - final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst); - isFirst = false; - entryIterator.setLikelihood(likelihood); - } - } - - - - @Override - public double calculateLocalLikelihood(final int readStart, final int readEnd, - final int hapStart, final int hapEnd, final boolean kmerMatch) { - if (readBases == null || haplotypeBases == null) - throw new IllegalStateException("read or haplotype was not loaded"); - final int hapSegmentLength = hapEnd - hapStart; - final int readSegmentLength = readEnd - readStart; - // trivial case when there is a single base match. 
- if (kmerMatch) { - return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength); - } else if (hapSegmentLength == readSegmentLength) { - if (hapSegmentLength == 0) { - return calculateLocalLikelihoodEmptySquare(readStart, readEnd); - } else if (hapSegmentLength == 1) { - return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart); - } else { // general (slower) solution. - return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); - } - } else if (hapSegmentLength == 0) { // must be full insertion we - return calculateLocalLikelihoodInsertion(readStart, readEnd); - } else if (readSegmentLength == 0) { // full deletion. - return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd); - } else { // general (slower) solution. - return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); - } - } - - /** - * Fast likelihood when the pair-hmm represents a deletion in the read. - */ - private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) { - double result = INITIAL_CONDITION; - if (readStart > 0) { // no penalty if at the beginning. - result *= transition[readStart][matchToDeletion]; - result *= - StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1); - result *= transition[readStart][indelToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - - /** - * Fast likelihood when the pair-hmm represents a insertion in the read. 
- */ - private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) { - double result = INITIAL_CONDITION; - result *= transition[readStart + 1][matchToInsertion]; - for (int i = readStart + 1; i < readEnd; i++) { - result *= transition[i + 1][insertionToInsertion]; - } - if (readEnd < readBases.length) { - result *= transition[readEnd + 1][indelToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Single base mismatch fast likelihood calculation. - */ - private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) { - double result = INITIAL_CONDITION; - result *= prior[readStart + 1][hapStart + 1]; - if (readStart > 0) { - result *= transition[readStart + 1][matchToMatch]; - } - if (readEnd < readBases.length) { - result *= transition[readEnd + 1][matchToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Empty square Pair-hmm. - */ - private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) { - double result = INITIAL_CONDITION; - if (readStart > 0 && readEnd < readBases.length) { - result *= transition[readStart + 1][matchToMatch]; - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Likelihood assuming that there is a exact match between both sequences: read and haplotype - */ - private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) { - double result = INITIAL_CONDITION; - if (hapSegmentLength == 1) { - result *= prior[readStart + 1][hapStart + 1]; - } else { - for (int i = 0; i < readSegmentLength; i++) { - result *= prior[readStart + i + 1][hapStart + i + 1]; - if (i > 0) { - result *= transition[readStart + i + 1][matchToMatch]; - } - } - } - return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; - } - - /** - * Revert to a general pair-hmm solution. 
- */ - private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) { - final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd); - final Double cachedCost = cachedResults.get(p); - if (cachedCost != null) { - return cachedCost; - } - double cost = calculateLocalLikelihoodGeneral(p); - cachedResults.put(p, cost); - return cost; - } - - /** - * Resolve the regular full pair-hmm. - * - *

- * With the possibility of reuse the previous haplotype common prefix by using - * a startIndex which is greater than 0. - */ - private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) { - final int edgeStart = initializeEdges ? 0 : startIndex + 1; - initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart); - updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1); - if (readEnd == readBases.length) - return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0); - else { - final double divider = 3.0; - final double dividerInverted = 1.0 / divider; - return StrictMath.log10(matchMatrix[readEnd][haplotypeLength] - * transition[readEnd][matchToMatch] * dividerInverted + - insertionMatrix[readEnd][haplotypeLength] - * transition[readEnd][indelToMatch] * dividerInverted + - deletionMatrix[readEnd][haplotypeLength] - * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - } - - - private double calculateLocalLikelihoodGeneral(final Problem p) { - - initializeMatrixValues(p,null); - // int fromCol = p.hapStart + 1; - // if (previousProblem == null) { - // fromCol = p.hapStart + 1; - // } else { - // final int sharedPrefix = previousProblem.followerStartIndex(p); - // if (sharedPrefix >= 0) - // fromCol = sharedPrefix + 1; - // else - // fromCol = p.hapStart + 1; - // } - // previousProblem = p; - - updateTable(p.readStart + 1, p.readEnd + 1, - p.hapStart + 1, p.hapEnd + 1); - - if (p.trailing) { - return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) - - (p.leading ? 
StrictMath.log10(p.hapEnd - p.hapStart) : 0); - } else { - final double divider = 3.0; - final double dividerInverted = 1.0 / divider; - return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][matchToMatch] * dividerInverted + - insertionMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][indelToMatch] * dividerInverted + - deletionMatrix[p.readEnd][p.hapEnd] - * transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); - } - } - - private void updateTable(final int rowFrom, final int rowTo, - final int colFrom, final int colTo) { - - for (int i = rowFrom; i < rowTo; i++) { - for (int j = colFrom; j < colTo; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - } - - /** - * Holds the properties of a pair-hmm computational problem. - */ - public class Problem { - private final byte[] haplotypeSegment; - private final int readStart; - private final int readEnd; - private final int hapStart; - private final int hapEnd; - private final int hashCode; - private final boolean trailing; - private final boolean leading; - - /** - * Construct a new project object. - * @param start inclusive start position on the read to consider. - * @param end exclusive after last position on the read to consider. - * @param hapStart inclusive start position on the haplotype to consider. - * @param hapEnd exclusive after last position on the haplotype to consider. 
- */ - public Problem(final int start, final int end, final int hapStart, - final int hapEnd) { - if (start < 0 || start > readBases.length) - throw new IllegalArgumentException("bad start index " + start); - if (end < start || end > readBases.length) - throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length); - if (hapStart < 0 || hapStart > haplotypeLength) - throw new IllegalArgumentException("bad hap start index " - + hapStart + " is larger than the haplotypeLength " + haplotypeLength); - if (hapEnd < hapStart || hapEnd > haplotypeLength) - throw new IllegalArgumentException("bad hap end index " - + hapEnd + " outside [" + hapStart + "," - + haplotypeLength + "]"); - - haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd); - readStart = start; - readEnd = end; - this.hapStart = hapStart; - this.hapEnd = hapEnd; - trailing = readEnd == readBases.length; - leading = readStart == 0; - - hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31); - } - - @Override - public int hashCode() { - return hashCode; - } - - @Override - public boolean equals(Object o) { - if (o == this) - return true; - else if (o == null) - return false; - else if (o.getClass() != this.getClass()) - return false; - else { - final Problem p = (Problem) o; - return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment); - } - } - - - } - - /** - * Returns the currently loaded read base calls. - * @return {@code never null}. 
- */ - public byte[] getReadBases() { - if (readBases == null) - throw new IllegalStateException("no read was previously loaded."); - return readBases; - } - - -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java deleted file mode 100644 index 125389217..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ /dev/null @@ -1,196 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.QualityUtils; - -/** - * Created with IntelliJ IDEA. 
- * User: rpoplin, carneiro - * Date: 10/16/12 - */ -public class LoglessPairHMM extends N2MemoryPairHMM { - protected static final double INITIAL_CONDITION = Math.pow(2, 1020); - protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected static final double TRISTATE_CORRECTION = 3.0; - - protected static final int matchToMatch = 0; - protected static final int indelToMatch = 1; - protected static final int matchToInsertion = 2; - protected static final int insertionToInsertion = 3; - protected static final int matchToDeletion = 4; - protected static final int deletionToDeletion = 5; - - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex) { - - if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { - final double initialValue = INITIAL_CONDITION / haplotypeBases.length; - // set the initial value (free deletions in the beginning) for the first row in the deletion matrix - for( int j = 0; j < paddedHaplotypeLength; j++ ) { - deletionMatrix[0][j] = initialValue; - } - } - - if ( ! 
constantsAreInitialized || recacheReadValues ) { - initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); - - // note that we initialized the constants - constantsAreInitialized = true; - } - - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - - for (int i = 1; i < paddedReadLength; i++) { - // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based - for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! (huge) - // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. - final int endI = paddedReadLength - 1; - double finalSumProbabilities = 0.0; - for (int j = 1; j < paddedHaplotypeLength; j++) { - finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; - } - return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; - } - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. - * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. 
- - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. - * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); - //TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1. - //TODO We have detected cases of 1.00002 which can cause problems downstream. 
This are typically masked - //TODO by the fact that we always add a indelToMatch penalty to all PairHMM likelihoods (~ -0.1) - //TODO This is in fact not well justified and although it does not have any effect (since is equally added to all - //TODO haplotypes likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here. - } - } - - /** - * Updates a cell in the HMM matrix - * - * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the - * initial conditions - - * @param indI row index in the matrices to update - * @param indJ column index in the matrices to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - - matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + - insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + - deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] ); - insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion]; - deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion]; - } -} diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java deleted file mode 100644 index 6cbbbd089..000000000 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ /dev/null @@ -1,158 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.LRUCache; - -/** - * The object temporarily held by a read that describes all of it's covariates. - * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class ReadCovariates { - private final static Logger logger = Logger.getLogger(ReadCovariates.class); - - /** - * How big should we let the LRU cache grow - */ - private static final int LRU_CACHE_SIZE = 500; - - /** - * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. - * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU - * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. 
- * - * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE - */ - private final static ThreadLocal> keysCache = new ThreadLocal>() { - @Override protected LRUCache initialValue() { - return new LRUCache(LRU_CACHE_SIZE); - } - }; - - /** - * Our keys, indexed by event type x read length x covariate - */ - private final int[][][] keys; - - /** - * The index of the current covariate, used by addCovariate - */ - private int currentCovariateIndex = 0; - - public ReadCovariates(final int readLength, final int numberOfCovariates) { - final LRUCache cache = keysCache.get(); - final int[][][] cachedKeys = cache.get(readLength); - if ( cachedKeys == null ) { - // There's no cached value for read length so we need to create a new int[][][] array - if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); - keys = new int[EventType.values().length][readLength][numberOfCovariates]; - cache.put(readLength, keys); - } else { - keys = cachedKeys; - } - } - - public void setCovariateIndex(final int index) { - currentCovariateIndex = index; - } - - /** - * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset - * - * @param mismatch the mismatch key value - * @param insertion the insertion key value - * @param deletion the deletion key value - * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates - */ - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; - } - - /** - * Get the keys for all covariates at read position for error model - * - * @param 
readPosition - * @param errorModel - * @return - */ - public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.ordinal()][readPosition]; - } - - public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.ordinal()]; - } - - // ---------------------------------------------------------------------- - // - // routines for testing - // - // ---------------------------------------------------------------------- - - protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } - protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } - protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } - - protected int[] getMismatchesKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); - } - - protected int[] getInsertionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_INSERTION); - } - - protected int[] getDeletionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_DELETION); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java deleted file mode 100644 index 58c3bb9bd..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ /dev/null @@ -1,396 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Arrays; - -public class VariantAnnotatorIntegrationTest extends WalkerTest { - - final static String REF = b37KGReference; - final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - - public static String baseTestString() { - return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; - } - - @Test - public void testHasAnnotsNotAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("360610e4990860bb5c45249b8ac31e5b")); - executeTest("test file has annotations, not asking for 
annotations, #1", spec); - } - - @Test - public void testHasAnnotsNotAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d69a3c92a0e8f44e09e7377e3eaed4e8")); - executeTest("test file has annotations, not asking for annotations, #2", spec); - } - - @Test - public void testHasAnnotsAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("823868a4b5b5ec2cdf080c059d04d31a")); - executeTest("test file has annotations, asking for annotations, #1", spec); - } - - @Test - public void testHasAnnotsAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("213560f395280e6a066d0b0497ce8881")); - executeTest("test file has annotations, asking for annotations, #2", spec); - } - - @Test - public void testNoAnnotsNotAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("540a9be8a8cb85b0f675fea1184bf78c")); - executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); - } - - @Test - public void testNoAnnotsNotAsking2() { - // the genotype annotations in this file are actually out of order. If you don't parse the genotypes - // they don't get reordered. It's a good test of the genotype ordering system. 
- WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("f900e65b65ff0f9d9eb0891ef9b28c73")); - executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); - } - - @Test - public void testNoAnnotsAsking1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("6f873b3152db291e18e3a04fbce2e117")); - executeTest("test file doesn't have annotations, asking for annotations, #1", spec); - } - - @Test - public void testNoAnnotsAsking2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("d8089c5874ff35a7fd7e35ebd7d3b137")); - executeTest("test file doesn't have annotations, asking for annotations, #2", spec); - } - - @Test - public void testExcludeAnnotations() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("552c2ad9dbfaa85d51d2def93c8229c6")); - executeTest("test exclude annotations", spec); - } - - @Test - public void testOverwritingHeader() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("0ed4c7760f6e7a158b6d743d257300f3")); - executeTest("test 
overwriting header", spec); - } - - @Test - public void testNoReads() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("1c423b7730b9805e7b885ece924286e0")); - executeTest("not passing it any reads", spec); - } - - @Test - public void testDBTagWithDbsnp() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); - executeTest("getting DB tag with dbSNP", spec); - } - - @Test - public void testMultipleIdsWithDbsnp() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); - executeTest("adding multiple IDs with dbSNP", spec); - } - - @Test - public void testDBTagWithHapMap() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("cc7184263975595a6e2473d153227146")); - executeTest("getting DB tag with HM3", spec); - } - - @Test - public void testDBTagWithTwoComps() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("6afbf05090ae139f53467cf6e0e71cf4")); - executeTest("getting DB tag with 2 comps", spec); - } - - @Test - public void testNoQuals() { - 
WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("aea983adc01cd059193538cc30adc17d")); - executeTest("test file doesn't have QUALs", spec); - } - - @Test - public void testUsingExpression() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); - executeTest("using expression", spec); - } - - @Test - public void testUsingExpressionWithID() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("3de1d1998203518098ffae233f3e2352")); - executeTest("using expression with ID", spec); - } - - @Test - public void testTabixAnnotationsAndParallelism() { - final String MD5 = "99938d1e197b8f10c408cac490a00a62"; - for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, - Arrays.asList(MD5)); - executeTest("Testing lookup vcf tabix vs. 
vcf tribble", spec); - } - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, - Arrays.asList(MD5)); - - executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); - } - - @Test - public void testSnpEffAnnotations() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " + - validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", - 1, - Arrays.asList("d9291845ce5a8576898d293a829a05b7") - ); - executeTest("Testing SnpEff annotations", spec); - } - - @Test - public void testSnpEffAnnotationsUnsupportedVersionGATKMode() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + - "--variant " + privateTestDir + "vcf4.1.example.vcf " + - "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " + - "-L 1:10001292-10012424", - 1, - Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae") - ); - executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec); - } - - @Test - public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + - "--variant " + privateTestDir + "vcf4.1.example.vcf " + - "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + - "-L 1:10001292-10012424", - 1, - UserException.class - ); - executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); - } - - @Test - public void testTDTAnnotation() { - final String MD5 = 
"427dfdc665359b67eff210f909ebf8a2"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing TDT annotation ", spec); - } - - - @Test - public void testChromosomeCountsPed() { - final String MD5 = "6b5cbedf4a8b3385edf128d81c8a46f2"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing ChromosomeCounts annotation with PED file", spec); - } - - @Test - public void testInbreedingCoeffPed() { - final String MD5 = "159a771c1deaeffb786097e106943893"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf" + - " -L " + privateTestDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + privateTestDir + "ug.random50000.family.ped -o %s", 1, - Arrays.asList(MD5)); - executeTest("Testing InbreedingCoeff annotation with PED file", spec); - } - - @Test - public void testStrandBiasBySample() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("testStrandBiasBySample", 
spec).getFirst().get(0); - - final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; - final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); - specNoFS.disableShadowBCF(); - final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); - - final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; - final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); - specAnn.disableShadowBCF(); - final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); - - // confirm that the FisherStrand values are identical for the two pipelines - final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - - final VCFCodec codecAnn = new VCFCodec(); - final FileInputStream sAnn = new FileInputStream(outputVCFAnn); - final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); - codecAnn.readHeader(lineIteratorAnn); - - while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - - final String lineAnn = lineIteratorAnn.next(); - Assert.assertFalse(lineAnn == null); - final VariantContext vcAnn = codecAnn.decode(lineAnn); - - Assert.assertTrue(vc.hasAttribute("FS")); - Assert.assertTrue(vcAnn.hasAttribute("FS")); - Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), 
vcAnn.getAttributeAsDouble("FS", -1.0)); - } - - Assert.assertFalse(lineIterator.hasNext()); - Assert.assertFalse(lineIteratorAnn.hasNext()); - } - - @Test - public void testQualByDepth() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); - - final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth"; - final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); - specNoQD.disableShadowBCF(); - final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); - - final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; - final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249")); - specAnn.disableShadowBCF(); - final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); - - // confirm that the QD values are present in the new file for all biallelic variants - // QD values won't be identical because some filtered reads are missing during re-annotation - - final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - - final VCFCodec codecAnn = new VCFCodec(); - final FileInputStream sAnn = new FileInputStream(outputVCFAnn); - final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new 
PositionalBufferedStream(sAnn)); - codecAnn.readHeader(lineIteratorAnn); - - while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - - final String lineAnn = lineIteratorAnn.next(); - Assert.assertFalse(lineAnn == null); - final VariantContext vcAnn = codecAnn.decode(lineAnn); - - if( vc.isBiallelic() ) { - Assert.assertTrue(vc.hasAttribute("QD")); - Assert.assertTrue(vcAnn.hasAttribute("QD")); - } - } - - Assert.assertFalse(lineIterator.hasNext()); - Assert.assertFalse(lineIteratorAnn.hasNext()); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java deleted file mode 100644 index fb15e9835..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/WalkerTestIntegrationTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class WalkerTestIntegrationTest extends WalkerTest { - - public void testBadMD5(String md5) { - WalkerTestSpec spec = new WalkerTestSpec("FAIL", Arrays.asList(md5)); - executeTest("", spec); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testNullMD5() { - testBadMD5(null); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testBadLengthMD5() { - testBadMD5("asdfasdfa"); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testSpacesMD5() { - testBadMD5("1de8e943fbf55246ebd19efa32f22a58 "); - } - - @Test(expectedExceptions = RuntimeException.class) - public void testBadCharMD5() { - testBadMD5("1de8e943fbf55246ebd19efa32f22a5_"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java deleted file mode 100644 index 12fa2525f..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumMap; -import java.util.List; - -public final class ReadRecalibrationInfoUnitTest extends BaseTest { - @DataProvider(name = "InfoProvider") - public Object[][] createCombineTablesProvider() { - List tests = new ArrayList(); - - for ( final int readLength: Arrays.asList(10, 100, 1000) ) { - for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { - tests.add(new Object[]{readLength, includeIndelErrors}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "InfoProvider") - public void testReadInfo(final int readLength, final boolean includeIndelErrors) { - final ReadCovariates covariates = new ReadCovariates(readLength, 2); - - final byte[] bases = new byte[readLength]; - 
final byte[] baseQuals = new byte[readLength]; - final byte[] insertionQuals = new byte[readLength]; - final byte[] deletionQuals = new byte[readLength]; - final boolean[] skips = new boolean[readLength]; - final double[] snpErrors = new double[readLength]; - final double[] insertionErrors = new double[readLength]; - final double[] deletionsErrors = new double[readLength]; - for ( int i = 0; i < readLength; i++ ) { - bases[i] = 'A'; - baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); - insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); - deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); - skips[i] = i % 2 == 0; - snpErrors[i] = 1.0 / (i+1); - insertionErrors[i] = 0.5 / (i+1); - deletionsErrors[i] = 0.3 / (i+1); - } - - final EnumMap errors = new EnumMap(EventType.class); - errors.put(EventType.BASE_SUBSTITUTION, snpErrors); - errors.put(EventType.BASE_INSERTION, insertionErrors); - errors.put(EventType.BASE_DELETION, deletionsErrors); - - final EnumMap quals = new EnumMap(EventType.class); - quals.put(EventType.BASE_SUBSTITUTION, baseQuals); - quals.put(EventType.BASE_INSERTION, insertionQuals); - quals.put(EventType.BASE_DELETION, deletionQuals); - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); - if ( includeIndelErrors ) { - read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); - read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); - } - - final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); - - Assert.assertEquals(info.getCovariatesValues(), covariates); - Assert.assertEquals(info.getRead(), read); - - for ( int i = 0; i < readLength; i++ ) { - Assert.assertEquals(info.skip(i), skips[i]); - for ( final EventType et : EventType.values() ) { - Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); - final byte expectedQual = et == EventType.BASE_SUBSTITUTION || 
includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; - Assert.assertEquals(info.getQual(et, i), expectedQual); - } - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java deleted file mode 100644 index 460b80121..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 4/5/12 - * Time: 11:28 AM - * To change this template use File | Settings | File Templates. 
- */ -public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest { - - private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); - - @Test(enabled = true) - public void testSNP_ACS_Pools() { - executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705"); - } - - @Test(enabled = true) - public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "dac2d7969e109aee9ad2dad573759f58"); - } - - @Test(enabled = true) - public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "ceb105e3db0f2b993e3d725b0d60b6a3"); - } - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java deleted file mode 100644 index 48f36ccc6..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ /dev/null @@ -1,73 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM; -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS; - -public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { - - private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde"); - } - - @Test(enabled = true) - public void testMT_SNP_DISCOVERY_sp4() { - executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","5d55b71688a0777a7c0247c376401368"); - } - - @Test(enabled = true) - public void testMT_SNP_GGA_sp10() { - executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "cf336d66a109c55f90e9ed2b3bc196c8"); 
- } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java deleted file mode 100644 index 6219eb578..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { - - private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing indel caller - // - // -------------------------------------------------------------------------------------------------------------- - // Basic indel testing with SLX data - @Test - public void testSimpleIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92")); - - executeTest(String.format("test indel caller in SLX"), spec); - } - - // Basic indel testing with SLX data - @Test - public void testIndelsWithLowMinAlleleCnt() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -minIndelCnt 1" + - " -L 1:10,000,000-10,100,000", - 1, - Arrays.asList("0cbe889e03bab6512680ecaebd52c536")); - - executeTest(String.format("test indel caller in SLX with low min allele count"), spec); - } - - @Test - 
public void testMultiTechnologyIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("e10c49fcf9a128745c2b050a52798e58")); - - executeTest(String.format("test indel calling, multiple technologies"), spec); - } - - @Test - public void testWithIndelAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("475f8148123792064130faf9f9030fec")); - executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); - } - - @Test - public void testWithIndelAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " - + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a7e4e1bd128424d46cffdd538b220074")); - executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); - } - - @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes - public void testMultiSampleIndels1() { - // since we're going to test the MD5s with GGA only do one here - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("")); - List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); - - WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + - "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("903af514f70db9238064da311c4ea0de")); - executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); - } - - @Test - public void testGGAwithNoEvidenceInReads() { - final String vcf = "small.indel.test.vcf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + - "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); - executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); - } - - @Test - public void testBaseIndelQualityScores() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + - " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + - " -o %s" + - " -L 20:10,000,000-10,100,000", - 1, - Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); - - executeTest(String.format("test UG with base indel quality scores"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing MinIndelFraction - // - // -------------------------------------------------------------------------------------------------------------- - - final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation - + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; - - @Test - public void testMinIndelFraction0() { - WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3")); - executeTest("test minIndelFraction 0.0", spec); - } - - @Test - public void testMinIndelFraction25() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("a5b6d7b32953500d936d3dff512a6254")); - executeTest("test minIndelFraction 0.25", spec); - } - - @Test - public void testMinIndelFraction100() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); - executeTest("test minIndelFraction 1.0", spec); - } - - // No testing of MD5 here, we previously blew up due to a 0 length haplotypes, so we just need to pass - @Test - public void testHaplotype0Length() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", - 0, - Collections.emptyList()); - executeTest("testHaplotype0Length", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java deleted file mode 100644 index dcaed8bf2..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ /dev/null @@ -1,385 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.util.BlockCompressedInputStream; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! 
// -// ********************************************************************************** // - -public class UnifiedGenotyperIntegrationTest extends WalkerTest { - - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing parameters - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testMinBaseQualityScore() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7")); - executeTest("test min_base_quality_score 26", spec); - } - - @Test - public void testSLOD() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("bc8a4e4ceb46776169b47146805c882a")); - executeTest("test SLOD", spec); - } - - @Test - public void testNDA() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - 
Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3")); - executeTest("test NDA", spec); - } - - @Test - public void testCompTrack() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("21185d9a7519356ba672757f5a522971")); - executeTest("test using comp track", spec); - } - - @Test(enabled = false) // EB: for some reason this test crashes whenever I run it on my local machine - public void testNoCmdLineHeaderStdout() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, - Collections.emptyList()); - executeTest("testNoCmdLineHeaderStdout", spec); - } - - @Test - public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d"); - } - - @Test - public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b"); - } - - @Test - public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8"); - } - - private void testOutputParameters(final String args, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + args, 1, - Arrays.asList(md5)); - executeTest(String.format("testParameter[%s]", args), spec); - } - - @Test - public void testConfidence() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 
", 1, - Arrays.asList("918109938ef355d759dafc3ebb47d8a5")); - executeTest("test confidence 1", spec1); - } - - @Test - public void testNoPrior() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, - Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); - executeTest("test no prior 1", spec1); - - } - @Test - public void testUserPrior() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, - Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); - executeTest("test user prior 1", spec1); - - } - - @Test - public void emitPLsAtAllSites() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, - Arrays.asList("552aced1b1ef7e4a554223f4719f9560")); - // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. 
IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail - spec1.disableShadowBCF(); - - executeTest("test all site PLs 1", spec1); - - } - // -------------------------------------------------------------------------------------------------------------- - // - // testing heterozygosity - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testHeterozyosity1() { - testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" ); - } - - @Test - public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" ); - } - - private void testHeterozosity(final double arg, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000 --heterozygosity " + arg, 1, - Arrays.asList(md5)); - executeTest(String.format("test heterozyosity[%s]", arg), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing compressed output - // - // -------------------------------------------------------------------------------------------------------------- - - private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207"; - - @Test - public void testCompressedOutput() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); - executeTest("test compressed output", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing parallelization - // - // 
-------------------------------------------------------------------------------------------------------------- - - @Test - public void testParallelization() { - - // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - - String md5 = "1f3fad09a63269c36e871e7ee04ebfaa"; - final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, - Arrays.asList(md5)); - executeTest("test parallelization (single thread)", spec1); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, - Arrays.asList(md5)); - executeTest("test parallelization (2 threads)", spec2); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, - Arrays.asList(md5)); - executeTest("test parallelization (4 threads)", spec3); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing calls with SLX, 454, and SOLID data - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testMultiTechnologies() { - WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000", - 1, - Arrays.asList("c4248f02103e37e89b0f22c0d9c98492")); - - executeTest(String.format("test multiple technologies"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing calls with BAQ - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testCallingWithBAQ() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000" + - " -baq CALCULATE_AS_NECESSARY", - 1, - Arrays.asList("96c7862d55e933b274cabe45c9c443d9")); - - executeTest(String.format("test calling with BAQ"), spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing SnpEff - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testSnpEffAnnotationRequestedWithoutRodBinding() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + - "-A SnpEff", - 1, - UserException.class); - executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing Ns in CIGAR - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testNsInCigar() { - final WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - UserException.UnsupportedCigarOperatorException.class); - - executeTest("test calling on reads with Ns in CIGAR", spec); - } - - @Test(enabled = true) - public void testCompressedVCFOutputWithNT() throws Exception { - WalkerTestSpec spec = new WalkerTestSpec("-T UnifiedGenotyper -R " + b37KGReference + " -I " - + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam" - + " -o %s -L 20:10,000,000-10,100,000 -nt 4", - 1, Arrays.asList("vcf.gz"), Arrays.asList("")); - final File vcf = executeTest("testCompressedVCFOutputWithNT", spec).first.get(0); - final AsciiLineReader reader = new AsciiLineReader(new BlockCompressedInputStream(vcf)); - int nLines = 0; - while ( reader.readLine() != null ) - nLines++; - Assert.assertTrue(nLines > 0); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing only emit samples - // - // -------------------------------------------------------------------------------------------------------------- - - @Test(enabled = true) - public void testOnlyEmitSample() throws Exception { - final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " - + privateTestDir + "AFR.complex.variants.bam --disableDithering" - + " -o %s -L 20:10,000,000-10,100,000"; - final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); - specAllSamples.disableShadowBCF(); - final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); - final List allSampleVCs = GATKVCFUtils.readVCF(allSamplesVCF).getSecond(); - - final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); - onlyHG01879.disableShadowBCF(); - final File onlyHG01879VCF = 
executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); - final List onlyHG01879VCs = GATKVCFUtils.readVCF(onlyHG01879VCF).getSecond(); - - Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); - for ( int i = 0; i < allSampleVCs.size(); i++ ) { - final VariantContext allSampleVC = allSampleVCs.get(i); - final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); - - if ( allSampleVC == null ) { - Assert.assertNull(onlyHG01879VC); - } else { - Assert.assertNotNull(onlyHG01879VC); - - Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); - Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); - Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); - - Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); - Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); - Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); - Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); - Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); - Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); - Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); - - final Genotype allG = allSampleVC.getGenotype("HG01879"); - final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); - Assert.assertEquals(allG.getAD(), onlyG.getAD()); - Assert.assertEquals(allG.getDP(), onlyG.getDP()); - Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); - Assert.assertEquals(allG.getPL(), onlyG.getPL()); - Assert.assertEquals(allG.toString(), onlyG.toString()); - } - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java deleted file mode 100644 index 01aab8ae3..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing normal calling - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testMultiSamplePilot1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("710d379607129935b1b7b6960ca7b213")); - executeTest("test MultiSample Pilot1", spec); - } - - @Test - public void testWithAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("ebfcc3dd8c1788929cb50050c5d456df")); - executeTest("test MultiSample Pilot2 with alleles passed in", spec1); - } - - @Test - public void testWithAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("3e646003c5b93da80c7d8e5d0ff2ee4e")); - 
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); - } - - @Test - public void testSingleSamplePilot2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("02b521fe88a6606a29c12c0885c3debd")); - executeTest("test SingleSample Pilot2", spec); - } - - @Test - public void testMultipleSNPAlleles() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("dd5ad3beaa75319bb2ef1434d2dd9f73")); - executeTest("test Multiple SNP alleles", spec); - } - - @Test - public void testBadRead() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("d915535c1458733f09f82670092fcab6")); - executeTest("test bad read", spec); - } - - @Test - public void testReverseTrim() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("a973298b2801b80057bea88507e2858d")); - executeTest("test reverse trim", spec); - } - - @Test - public void testMismatchedPLs() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper 
--contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("8d91d98c4e79897690d3c6918b6ac761")); - executeTest("test mismatched PLs", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java deleted file mode 100644 index 3b5690046..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f")); - executeTest("test calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamSNPs() { - testReducedCalling("SNP", "e8de8c523751ad2fa2ee20185ba5dea7"); - } - - @Test - public void testReducedBamINDELs() { - testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6"); - } - - - private void testReducedCalling(final String model, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, - Arrays.asList(md5)); - executeTest("test calling on a ReducedRead BAM with " + model, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java deleted file 
mode 100644 index 550153be0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ /dev/null @@ -1,222 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - - -// SEE private/R/pls.R if you want the truth output for these tests -public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { - @DataProvider(name = "TestCombineGLs") - public Object[][] makeTestCombineGLs() { - List tests = new ArrayList(); - - tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)}); - tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)}); - tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 
10, 0)}); - - // AA AB BB AC BC CC => AA AB+BC CC - tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)}); - tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)}); - - tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); - tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); - - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)}); - - tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); - - tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)}); - tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)}); - - return tests.toArray(new Object[][]{}); - } - - private Genotype makePL(final int ... 
PLs) { - return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); - } - - @Test(enabled = true, dataProvider = "TestCombineGLs") - private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final Genotype combined = calc.combineGLs(testg, altIndex, nAlts); - - Assert.assertEquals(combined.getPL(), expected.getPL(), - "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); - } - - - static Allele A = Allele.create("A", true); - static Allele C = Allele.create("C"); - static Allele G = Allele.create("G"); - - @DataProvider(name = "TestMakeAlleleConditionalContexts") - public Object[][] makeTestMakeAlleleConditionalContexts() { - List tests = new ArrayList(); - - final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A)); - final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C)); - final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G)); - final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G)); - final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C)); - - final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); - final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); - final Genotype gACcombined = makePL(0, 2, 5); - final Genotype gACcombined2 = makePL(0, 1, 4); - final Genotype gAGcombined = makePL(0, 4, 9); - - // biallelic - tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); - - // tri-allelic - tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), 
vcAG.genotypes(gAGcombined).make())}); - tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); - - return tests.toArray(new Object[][]{}); - } - - - @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") - private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); - - Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size()); - - for ( int i = 0; i < biAllelicVCs.size(); i++ ) { - final VariantContext actual = biAllelicVCs.get(i); - final VariantContext expected = expectedVCs.get(i); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); - - for ( int j = 0; j < actual.getNSamples(); j++ ) - Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), - "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); - } - } - - - @DataProvider(name = "ThetaNTests") - public Object[][] makeThetaNTests() { - List tests = new ArrayList(); - - final List log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0); - - for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) { - for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) { - for ( List permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) { - tests.add(new Object[]{permutations, Math.pow(10, log10pRef)}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ThetaNTests") - public void testThetaNTests(final List log10LAlleles, final double pRef) { - // biallelic - final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef}); - - final double log10pNonRef = 
Math.log10(1-pRef); - - final List originalPriors = new LinkedList(); - final List pNonRefN = new LinkedList(); - for ( int i = 0; i < log10LAlleles.size(); i++ ) { - final double log10LAllele1 = log10LAlleles.get(i); - final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); - originalPriors.add(result1); - pNonRefN.add(log10pNonRef*(i+1)); - } - - final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2); - final List thetaNPriors = calc.applyMultiAllelicPriors(originalPriors); - - double prevPosterior = 0.0; - for ( int i = 0; i < log10LAlleles.size(); i++ ) { - final AFCalcResult thetaN = thetaNPriors.get(i); - AFCalcResult orig = null; - for ( final AFCalcResult x : originalPriors ) - if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping())) - orig = x; - - Assert.assertNotNull(orig, "couldn't find original AFCalc"); - - Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6); - Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6); - - Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0()); - prevPosterior = orig.getLog10PosteriorOfAFGT0(); - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java deleted file mode 100644 index 564a475b0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java +++ /dev/null @@ -1,249 +0,0 @@ -/* -* By downloading the PROGRAM 
you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.RandomDNA; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Tests for {@link AssemblyResultSet}. 
- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class AssemblyResultSetUnitTest extends BaseTest -{ - private GenomeLocParser genomeLocParser; - private SAMFileHeader header; - - @BeforeClass - public void init() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - - @Test - public void testEmptyResultSet() { - final AssemblyResultSet subject = new AssemblyResultSet(); - - Assert.assertEquals(subject.getHaplotypeList().size(), 0); - Assert.assertEquals(subject.getHaplotypeCount(),0); - Assert.assertEquals(subject.getReferenceHaplotype(),null); - Assert.assertEquals(subject.getFullReferenceWithPadding(),null); - Assert.assertEquals(subject.getPaddedReferenceLoc(),null); - Assert.assertEquals(subject.getRegionForGenotyping(),null); - Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null); - Assert.assertFalse(subject.hasMultipleKmerSizes()); - } - - @Test - public void testAddReferenceHaplotype() { - - final Haplotype ref = new Haplotype("ACGT".getBytes(),true); - ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 )); - final AssemblyResultSet subject = new AssemblyResultSet(); - - Assert.assertTrue(subject.add(ref)); - Assert.assertFalse(subject.add(ref)); - - Assert.assertEquals(subject.getReferenceHaplotype(),ref); - Assert.assertEquals(subject.getHaplotypeCount(),1); - Assert.assertEquals(subject.getHaplotypeList().size(),1); - } - - @Test(dataProvider="assemblyResults") - public void testAddManyHaplotypes(final java.util.List assemblyResults, - final java.util.List> haplotypes) { - final AssemblyResultSet subject = new AssemblyResultSet(); - for (int i = 0; i < haplotypes.size(); i++) { - final int haplotypeCountBefore = subject.getHaplotypeCount(); - final java.util.List haplos = haplotypes.get(i); - final AssemblyResult ar = assemblyResults.get(i); - for (final Haplotype h : haplos) { - 
Assert.assertTrue(subject.add(h, ar)); - Assert.assertFalse(subject.add(h,ar)); - if (h.isReference()) - Assert.assertEquals(subject.getReferenceHaplotype(),h); - } - final int haplotypeCountAfter = subject.getHaplotypeCount(); - Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore); - Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize()); - Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize()); - Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph()); - } - } - - @Test(dataProvider="trimmingData") - public void testTrimTo(final Map haplotypesAndResultSets, final ActiveRegion original) { - final AssemblyResultSet subject = new AssemblyResultSet(); - for (final Map.Entry entry : haplotypesAndResultSets.entrySet()) - subject.add(entry.getKey(),entry.getValue()); - subject.setRegionForGenotyping(original); - final GenomeLoc originalLocation = original.getExtendedLoc(); - final int length = originalLocation.size(); - final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2); - final ActiveRegion newRegion = original.trim(newLocation); - - final Map originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size()); - for (final Haplotype h : haplotypesAndResultSets.keySet()) - originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); - - final AssemblyResultSet trimmed = subject.trimTo(newRegion, originalHaplotypesByTrimmed); - - Assert.assertFalse(subject.wasTrimmed()); - Assert.assertTrue(trimmed.wasTrimmed()); - - for (final Haplotype h : trimmed.getHaplotypeList()) { - Assert.assertEquals(h.getGenomeLocation(),newLocation); - Assert.assertEquals(h.getBases().length,newLocation.size()); - } - } - - @DataProvider(name="trimmingData") - public Iterator trimmingData() { - final ActiveRegion activeRegion = new 
ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25); - final int length = activeRegion.getExtendedLoc().size(); - final RandomDNA rnd = new RandomDNA(13); // keep it prepoducible by fixing the seed to lucky 13. - final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] { - "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]); - - final List haplotypes = actd.haplotypeList(); - for (final Haplotype h : haplotypes) - h.setGenomeLocation(activeRegion.getExtendedLoc()); - - final ReadThreadingGraph rtg = new ReadThreadingGraph(10); - for (final Haplotype h : haplotypes) - rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference()); - final SeqGraph seqGraph = rtg.convertToSequenceGraph(); - final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); - ar.setThreadingGraph(rtg); - final Map result = - new HashMap<>(); - for (final Haplotype h : haplotypes) - result.put(h,ar); - return Collections.singleton(new Object[] {result,activeRegion}).iterator(); - - } - - - - - @DataProvider(name="assemblyResults") - public java.util.Iterator assemblyResults() { - final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length); - final Object[][] result = new Object[size][]; - - for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) { - final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]); - final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph()); - ar.setThreadingGraph(rtg); - final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1]; - final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length]; - for (int j = 0; j < haplotypeStrings.length; j++) { - haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j 
== 0); - haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1)); - } - result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))}; - for (int j = 0; j < TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) { - final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]); - final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph()); - ar10.setThreadingGraph(rtg10); - final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1]; - final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length]; - for (int k = 0; k < haplotypeStrings10.length; k++) { - haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false); - haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1)); - } - - result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10), - Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) }; - } - } - return Arrays.asList(result).iterator(); - } - - - private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { - {"[ks=3]{REF: ACT}",new Object[] {"ACT"}}, - {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + - "{ (3) -> A -> G -> (2) }" + - "{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}}, - {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} , - {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + - "{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} , - {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + - "{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}", - new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} , - {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> 
T -> A -> C -> G -> T -> A -> T}", - new Object[] {"ACTATCAGTACGTAT"}}, - {"[ks=3]{REF: ACT -> A -> T(1) }" + - "{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}}, - {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] {"TTTACT","TTTAT"}} - }; - - private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { - {"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}}, - {"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}", - new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}}, - - }; - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java deleted file mode 100644 index 8633a1d9d..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ /dev/null @@ -1,305 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/15/12 - */ - -import net.sf.picard.reference.ReferenceSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * Unit tests for GenotypingEngine - */ -public class GenotypingEngineUnitTest extends BaseTest { - - private static ReferenceSequenceFile seq; - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - genomeLocParser = new GenomeLocParser(seq); - } - - @Test - public void testFindHomVarEventAllelesInSample() { - final List eventAlleles = new ArrayList(); - eventAlleles.add( Allele.create("A", true) ); - eventAlleles.add( Allele.create("C", false) ); - final List haplotypeAlleles = new ArrayList(); - haplotypeAlleles.add( Allele.create("AATA", true) ); - haplotypeAlleles.add( Allele.create("AACA", false) ); - haplotypeAlleles.add( Allele.create("CATA", false) ); - haplotypeAlleles.add( Allele.create("CACA", false) ); - final List haplotypes = new ArrayList(); - haplotypes.add(new Haplotype("AATA".getBytes())); - haplotypes.add(new Haplotype("AACA".getBytes())); - haplotypes.add(new Haplotype("CATA".getBytes())); - haplotypes.add(new Haplotype("CACA".getBytes())); - final 
List haplotypeAllelesForSample = new ArrayList(); - haplotypeAllelesForSample.add( Allele.create("CATA", false) ); - haplotypeAllelesForSample.add( Allele.create("CACA", false) ); - final List> alleleMapper = new ArrayList>(); - List Aallele = new ArrayList(); - Aallele.add(haplotypes.get(0)); - Aallele.add(haplotypes.get(1)); - List Callele = new ArrayList(); - Callele.add(haplotypes.get(2)); - Callele.add(haplotypes.get(3)); - alleleMapper.add(Aallele); - alleleMapper.add(Callele); - final List eventAllelesForSample = new ArrayList(); - eventAllelesForSample.add( Allele.create("C", false) ); - eventAllelesForSample.add( Allele.create("C", false) ); - - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); - logger.warn("expected alleles = " + eventAllelesForSample); - } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); - } - - @Test - public void testFindHetEventAllelesInSample() { - final List eventAlleles = new ArrayList(); - eventAlleles.add( Allele.create("A", true) ); - eventAlleles.add( Allele.create("C", false) ); - eventAlleles.add( Allele.create("T", false) ); - final List haplotypeAlleles = new ArrayList(); - haplotypeAlleles.add( Allele.create("AATA", true) ); - haplotypeAlleles.add( Allele.create("AACA", false) ); - haplotypeAlleles.add( Allele.create("CATA", false) ); - haplotypeAlleles.add( Allele.create("CACA", false) ); - haplotypeAlleles.add( Allele.create("TACA", false) ); - haplotypeAlleles.add( Allele.create("TTCA", false) ); - haplotypeAlleles.add( Allele.create("TTTA", false) ); - final List haplotypes = new ArrayList(); 
- haplotypes.add(new Haplotype("AATA".getBytes())); - haplotypes.add(new Haplotype("AACA".getBytes())); - haplotypes.add(new Haplotype("CATA".getBytes())); - haplotypes.add(new Haplotype("CACA".getBytes())); - haplotypes.add(new Haplotype("TACA".getBytes())); - haplotypes.add(new Haplotype("TTCA".getBytes())); - haplotypes.add(new Haplotype("TTTA".getBytes())); - final List haplotypeAllelesForSample = new ArrayList(); - haplotypeAllelesForSample.add( Allele.create("TTTA", false) ); - haplotypeAllelesForSample.add( Allele.create("AATA", true) ); - final List> alleleMapper = new ArrayList>(); - List Aallele = new ArrayList(); - Aallele.add(haplotypes.get(0)); - Aallele.add(haplotypes.get(1)); - List Callele = new ArrayList(); - Callele.add(haplotypes.get(2)); - Callele.add(haplotypes.get(3)); - List Tallele = new ArrayList(); - Tallele.add(haplotypes.get(4)); - Tallele.add(haplotypes.get(5)); - Tallele.add(haplotypes.get(6)); - alleleMapper.add(Aallele); - alleleMapper.add(Callele); - alleleMapper.add(Tallele); - final List eventAllelesForSample = new ArrayList(); - eventAllelesForSample.add( Allele.create("A", true) ); - eventAllelesForSample.add( Allele.create("T", false) ); - - if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) { - logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)); - logger.warn("expected alleles = " + eventAllelesForSample); - } - Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))); - } - - private boolean compareAlleleLists(List l1, List l2) { - if( l1.size() != l2.size() ) { - return false; // sanity check - } - - for( int i=0; i < l1.size(); i++ ){ - if ( !l2.contains(l1.get(i)) 
) - return false; - } - return true; - } - - - private class BasicGenotypingTestProvider extends TestDataProvider { - byte[] ref; - byte[] hap; - Map expected; - - public BasicGenotypingTestProvider(String refString, String hapString, Map expected) { - super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); - ref = refString.getBytes(); - hap = hapString.getBytes(); - this.expected = expected; - } - - public Map calcAlignment() { - final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); - return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); - } - } - - @DataProvider(name = "BasicGenotypingTestProvider") - public Object[][] makeBasicGenotypingTests() { - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(2 + contextSize, (byte)'M'); - map.put(21 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'I'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map); - } - - for( int contextSize : new 
int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1, (byte)'M'); - map.put(20, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(2 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'I'); - map.put(30 + contextSize, (byte)'D'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map); - } - - for( int contextSize : new int[]{0,1,5,9,24,36} ) { - Map map = new HashMap(); - map.put(1 + contextSize, (byte)'M'); - map.put(20 + contextSize, (byte)'D'); - map.put(28 + contextSize, (byte)'M'); - final String context = Utils.dupString('G', contextSize); - new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map); - } - - return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class); - } - - @Test(dataProvider = "BasicGenotypingTestProvider", enabled = true) - public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) { - Map calculatedMap = cfg.calcAlignment(); - Map expectedMap = cfg.expected; - logger.warn(String.format("Test: %s", cfg.toString())); - if(!compareVCMaps(calculatedMap, expectedMap)) { - logger.warn("calc map = " + calculatedMap); - logger.warn("expected map = " + expectedMap); - } - Assert.assertTrue(compareVCMaps(calculatedMap, 
expectedMap)); - } - - /** - * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext - */ - private boolean compareVCMaps(Map calc, Map expected) { - if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check - for( Integer loc : expected.keySet() ) { - Byte type = expected.get(loc); - switch( type ) { - case 'I': - if( !calc.get(loc).isSimpleInsertion() ) { return false; } - break; - case 'D': - if( !calc.get(loc).isSimpleDeletion() ) { return false; } - break; - case 'M': - if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; } - break; - default: - return false; - } - } - return true; - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java deleted file mode 100644 index 3907ffbd6..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ /dev/null @@ -1,99 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM; -import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF; - -public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { - - private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; - final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88c10027c21712b1fe475c06cadd503c"); - } - - private void HCTestSymbolicVariants(String 
bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; - final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); - } - - // TODO -- need a better symbolic allele test - @Test - public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); - } - - private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAComplex() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "b787be740423b950f8529ccc838fabdd"); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "f74d68cbc1ecb66a7128258e111cd030"); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java deleted file mode 100644 index 97744f126..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ /dev/null @@ -1,131 +0,0 
@@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { - @DataProvider(name = "MyDataProvider") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList<>(); - - final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; - final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, 
"9b9923ef41bfc7346c905fdecf918f92"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"}); - - - return tests.toArray(new Object[][]{}); - } - - /** - * Example testng test using MyDataProvider - */ - @Test(dataProvider = "MyDataProvider") - public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); - executeTest(name, spec); - } - - @Test - public void testERCRegionWithNoCalledHaplotypes() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); - spec.disableShadowBCF(); - executeTest("testERCRegionWithNoCalledHaplotypes", spec); - } - - @Test() - public void testMissingGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller 
--pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } - - @Test() - public void testWrongParameterGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } - - @Test() - public void testWrongTypeGVCFIndexException() { - // ensure non-optimal, if optimal changes - GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; - if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) - type = GATKVCFIndexType.DYNAMIC_SIZE; - - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); - spec.disableShadowBCF(); - executeTest("testMissingGVCFIndexingStrategyException", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java deleted file mode 100644 index dfbbd7084..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ /dev/null @@ -1,307 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - -public class HaplotypeCallerIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final static String NA12878_BAM = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; - final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; - final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam"; - final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; - final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; - - private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering 
--pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCaller: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b"); - } - - @Test - public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295"); - } - - @Test - public void testHaplotypeCallerGraphBasedSingleSample() { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407"); - } - - @Test - public void testHaplotypeCallerGraphBasedMultiSample() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941"); - } - - @Test(enabled = false) // can't annotate the rsID's yet - public void testHaplotypeCallerSingleSampleWithDbsnp() { - HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); - } - - @Test - public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "b09437f11db40abd49195110e50692c2"); - } - - @Test - public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); - } - - private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerSingleSampleIndelQualityScores() { - 
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e"); - } - - private void HCTestNearbySmallIntervals(String bam, String args, String md5) { - try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); - final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { - if( containsDuplicateRecord(vcf, parser) ) { - throw new IllegalStateException("Duplicate records detected but there should be none."); - } - } - } catch( FileNotFoundException e ) { - throw new IllegalStateException("Could not find the b37 reference file."); - } - } - - private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList<>(); - try { - for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); - } - } catch( IOException e ) { - throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); - } - - final Set> VCsAsSet = new HashSet<>(VCs); - return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
- } - - - @Test - public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2"); - } - - // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper - // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to - // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust - // any of the calls in that region because it is so messy. - @Test - public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); - executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); - } - - @Test - public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); - executeTest("HCTestStructuralIndels: ", spec); - } - - @Test - public void HCTestDoesNotFailOnBadRefBase() { - // don't care about the output - just want to make sure it doesn't fail - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); - 
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); - } - - @Test - public void HCTestDanglingTailMergingForDeletions() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); - - // confirm that the call is the correct one - final VCFCodec codec = new VCFCodec(); - final FileInputStream s = new FileInputStream(outputVCF); - final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); - codec.readHeader(lineIterator); - final String line = lineIterator.next(); - Assert.assertFalse(line == null); - final VariantContext vc = codec.decode(line); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertTrue(vc.getReference().basesMatch("ATGTATG")); - Assert.assertTrue(vc.getAlternateAllele(0).basesMatch("A")); - } - - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); - executeTest("HC calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamWithReadsNotFullySpanningDeletion() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T 
HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("6a9222905c740b9208bf3c67478514eb")); - executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // test dbSNP annotation - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestDBSNPAnnotationWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("a43d6226a51eb525f0774f88e3778189")); - executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); - } - - @Test - public void HCTestDBSNPAnnotationWEx() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 - + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); - executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); - } - - @Test - public void HCTestDBSNPAnnotationWGSGraphBased() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - 
Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f")); - executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); - } - - @Test - public void HCTestDBSNPAnnotationWExGraphBased() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 - + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("69db1045b5445a4f90843f368bd62814")); - executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); - } - - @Test - public void HCTestGraphBasedPCRFreePositiveLogLkFix() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 " - , 1, - Arrays.asList("")); - executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // test PCR indel model - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestAggressivePcrIndelModelWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("19c2992541ede7407192660fdc1fadbf")); - executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); - } - - @Test - public void HCTestConservativePcrIndelModelWGS() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T 
HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16")); - executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java deleted file mode 100644 index 21648b2b9..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. 
-* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. 
The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { - @DataProvider(name = "NCTDataProvider") - public Object[][] makeNCTDataProvider() { - List tests = new ArrayList<>(); - - for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "NCTDataProvider") - public void testHCNCT(final int nct, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " - + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + - " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, - Arrays.asList(md5)); - executeTest("HC test parallel HC with NCT with nct " + nct, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java deleted file mode 100644 index d163c0497..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ /dev/null @@ -1,426 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.variant.variantcontext.GenotypeType; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import 
org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class ReferenceConfidenceModelUnitTest extends BaseTest { - GenomeLocParser parser; - final String RGID = "ID1"; - GATKSAMReadGroupRecord rg; - final String sample = "NA12878"; - final Set samples = Collections.singleton(sample); - SAMFileHeader header; - ReferenceConfidenceModel model; - - @BeforeClass - public void setUp() throws Exception { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - rg = new GATKSAMReadGroupRecord(RGID); - rg.setSample(sample); - header.addReadGroup(rg); - parser = new GenomeLocParser(header.getSequenceDictionary()); - } - - @BeforeMethod - public void setupModel() { - model = new ReferenceConfidenceModel(parser, samples, header, 10); - } - - @DataProvider(name = "CalcNIndelInformativeReadsData") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList<>(); - - { // very basic testing - final String ref = "ACGT"; - final String read = "ACGT"; - tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)}); - tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)}); - tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)}); - tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)}); - } - - { // actually interesting case where some sites aren't informative - final String ref = "NNAAAANN"; - final String read1 = "NNA"; - final String read2 = "NNAA"; - final String read3 = "NNAAA"; - final String read4 = "NNAAAA"; - final String read5 = "NNAAAAN"; - tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); - tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 
1, 1, 0, 0)}); - } - - { - for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) { - final String anchor = Utils.dupString("N", repeatUnit.length()); - for ( int nUnits = 1; nUnits < 10; nUnits++ ) { - final String repeat = Utils.dupString(repeatUnit, nUnits); - final String ref = anchor + repeat + anchor; - for ( int readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) { - final String read = anchor + repeat.substring(0, readLen); - final List expected = new LinkedList<>(); - for ( int i = 0; i < anchor.length(); i++ ) expected.add(1); - for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() ? 1 : 0); - for ( int i = 0; i < anchor.length(); i++ ) expected.add(0); - tests.add(new Object[]{read, ref, repeatUnit.length(), expected}); - - final List result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1)); - result.addAll(Collections.nCopies(anchor.length(), 0)); - tests.add(new Object[]{ref, ref, repeatUnit.length(), result}); - } - } - - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CalcNIndelInformativeReadsData") - public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List expected ) { - final byte qual = (byte)30; - final byte[] quals = Utils.dupBytes(qual, readBases.length()); - - for ( int i = 0; i < readBases.getBytes().length; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M"); - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i); - final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i); - final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize); - Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i); - } - } - - @Test - public void testCalcNIndelInformativeReducedReads() { - 
final String bases = "ACGGGTTTGGAC"; - final byte[] quals = Utils.dupBytes((byte)30, bases.length()); - final int count = 10; - final int[] counts = new int[bases.length()]; - for ( int i = 0; i < counts.length; i++ ) - counts[i] = count; - final int position = 100; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts); - read.setReadString(bases); - read.setBaseQualities(quals); - read.setCigarString(bases.length() + "M"); - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position); - final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), 0); - final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3); - Assert.assertEquals(actual, count); - } - - @Test - public void testClose() { - model.close(); - } - - @Test - public void testWorstGL() { - final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100"); - final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200"); - final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200"); - - Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10); - Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10); - Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0); - Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0); - } - - @Test - public void testIndelLikelihoods() { - GenotypeLikelihoods prev = model.getIndelPLs(0); - Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); - Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); - - for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { - final GenotypeLikelihoods current = model.getIndelPLs(i); - final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); - final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); - Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " 
curr " + current + " at " + i); - Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i); - Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + i); -// logger.warn("result at " + i + " is " + current); - prev = current; - } - } - - @Test - public void testOverlappingVariantContext() { - final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C")); - final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C")); - final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A")); - final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT")); - - final List calls = Arrays.asList(vc13, vc12_15, vc18, vc10); - - checkOverlapping(8, calls, null); - checkOverlapping(9, calls, null); - checkOverlapping(10, calls, vc10); - checkOverlapping(11, calls, null); - checkOverlapping(12, calls, vc12_15); - checkOverlapping(13, calls, vc13); - checkOverlapping(14, calls, vc12_15); - checkOverlapping(15, calls, vc12_15); - checkOverlapping(16, calls, null); - checkOverlapping(17, calls, null); - checkOverlapping(18, calls, vc18); - checkOverlapping(19, calls, null); - checkOverlapping(20, calls, null); - } - - private void checkOverlapping(final int pos, Collection calls, final VariantContext expected) { - final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos); - final VariantContext actual = model.getOverlappingVariantContext(loc, calls); - Assert.assertEquals(actual, expected); - } - - // - // test reference calculation - // - private class RefConfData { - final String ref; - final int extension; - final Haplotype refHap; - final GenomeLoc refLoc, 
paddedRefLoc; - final ActiveRegion region; - int readCounter = 0; - - private RefConfData(String ref, int extension) { - this.ref = ref; - this.extension = extension; - - refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd()); - paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, getEnd() + extension); - region = new ActiveRegion(getRefLoc(), parser, extension); - final String pad = Utils.dupString("N", extension); - refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc()); - } - - public GenomeLoc getRefLoc() { return refLoc; } - public GenomeLoc getPaddedRefLoc() { return paddedRefLoc; } - public ActiveRegion getActiveRegion() { return region; } - public Haplotype getRefHap() { return refHap; } - public int getStart() { return 100; } - public int getEnd() { return getStart() + getRefLength() - 1; } - public byte[] getRefBases() { return ref.getBytes(); } - public int getRefLength() { return ref.length(); } - - public GATKSAMRecord makeRead(final int start, final int length) { - final byte[] quals = Utils.dupBytes((byte)30, length); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M"); - read.setReadGroup(rg); - return read; - } - } - - - @DataProvider(name = "RefConfidenceData") - public Object[][] makeRefConfidenceData() { - List tests = new ArrayList<>(); - - for ( int i = 0; i < 10; i++ ) { - for ( final int extension : Arrays.asList(0, 10) ) { - tests.add(new Object[]{i, extension}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "RefConfidenceData") - public void testRefConfidenceBasic(final int nReads, final int extension) { - final RefConfData data = new RefConfData("ACGTAACCGGTT", extension); - final List haplotypes = Arrays.asList(data.getRefHap()); - final List calls = 
Collections.emptyList(); - - for ( int i = 0; i < nReads; i++ ) { - data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); - } - - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - - @Test - public void testRefConfidencePartialReads() { - final String ref = "ACGTAACCGGTT"; - for ( int readLen = 3; readLen < ref.length(); readLen++ ) { - for ( int start = 0; start < ref.length() - readLen; start++ ) { - final RefConfData data = new RefConfData(ref, 0); - final List haplotypes = Arrays.asList(data.getRefHap()); - final List calls = Collections.emptyList(); - - data.getActiveRegion().add(data.makeRead(start, readLen)); - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); - for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - } - } - - @Test - public void testRefConfidenceWithCalls() { - final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0); - final int start = xxxdata.getStart(); - final int stop = xxxdata.getEnd(); - - for ( int nReads = 0; nReads < 2; nReads++ ) { - - final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C")); - final 
VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C")); - final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C")); - final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 4, Arrays.asList("ACG", "A")); - final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG")); - - final List allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns); - - for ( int n = 1; n <= allCalls.size(); n++ ) { - for ( final List calls : Utils.makePermutations(allCalls, n, false) ) { -// logger.warn("Executing " + n + " " + calls.size()); - final RefConfData data = new RefConfData("ACGTAACCGGTT", 0); - final List haplotypes = Arrays.asList(data.getRefHap()); - for ( int i = 0; i < nReads; i++ ) { - data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); - } - - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); - - final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); - checkReferenceModelResult(data, contexts, expectedDPs, calls); - } - } - } - } - - private void checkReferenceModelResult(final RefConfData data, final List contexts, final List expectedDPs, final List calls) { - Assert.assertNotNull(contexts); - - final GenomeLoc loc = data.getActiveRegion().getExtendedLoc(); - final List seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false)); - - for ( int i = 0; i < loc.size(); i++ ) { - final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i); - final VariantContext call = 
model.getOverlappingVariantContext(curPos, calls); - final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts); - - if ( ! data.getActiveRegion().getLocation().containsP(curPos) ) { - // part of the extended interval, but not the full interval - Assert.assertNull(refModel); - continue; - } - - if ( call != null ) { - Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); - } else { - final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart()); - Assert.assertEquals(refModel.getStart(), loc.getStart() + i); - Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); - Assert.assertFalse(refModel.hasLog10PError()); - Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertTrue(refModel.hasGenotype(sample)); - - final Genotype g = refModel.getGenotype(sample); - Assert.assertTrue(g.hasAD()); - Assert.assertTrue(g.hasDP()); - Assert.assertEquals(g.getDP(), expectedDP); - Assert.assertTrue(g.hasGQ()); - Assert.assertTrue(g.hasPL()); - } - - final VariantContext vc = call == null ? 
refModel : call; - if ( curPos.getStart() == vc.getStart() ) { - for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) { - final int j = pos - data.getActiveRegion().getLocation().getStart(); - Assert.assertFalse(seenBP.get(j)); - seenBP.set(j, true); - } - } - } - - for ( int i = 0; i < seenBP.size(); i++ ) { - Assert.assertEquals((boolean)seenBP.get(i), true); - } - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java deleted file mode 100644 index 2918501b2..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; - -import org.broadinstitute.sting.BaseTest; -import org.jgrapht.EdgeFactory; -import org.testng.Assert; -import org.testng.Reporter; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Created with IntelliJ IDEA. - * User: valentin - * Date: 9/5/13 - * Time: 11:04 AM - * To change this template use File | Settings | File Templates. - */ -public class RouteUnitTest extends BaseTest { - - - @Test(dataProvider="isSuffixTestData") - public void testIsSuffix(final Route route, final Path path, final boolean expectedResult) { - Assert.assertEquals(route.isSuffix(path), expectedResult); - } - - @DataProvider(name="isSuffixTestData") - public Iterator isSuffixTestData() { - return TEST_DATA.iterator(); - } - - private static final int[] TEST_EDGE_PAIRS1 = new int[] { - 3 , 4, - 4 , 5, - 5, 7, - 7, 8, - 8, 9, - 4 , 6, - 6, 9, - 9, 11, - 11, 12, - }; - - - - private static final int[] TEST_EDGE_PAIRS = new int[] { - 1 , 2, - 2 , 3, - 3 , 4, - 4 , 5, - 5, 7, - 7, 8, - 8, 9, - 4 , 6, - 6, 9, - 9, 10, - 10, 11, - 11, 12, - 2, 5, - 5, 12, - - 3, 13, - 13, 14, - 14, 15 - }; - - public static final EdgeFactory TEST_GRAPH_EDGE_FACTORY = new EdgeFactory() { - @Override - public BaseEdge createEdge(final BaseVertex baseVertex, final BaseVertex baseVertex2) { - return new BaseEdge(false, 0); - } - }; - - - private static Map vertexByInteger = new HashMap<>(); - private static final BaseGraph TEST_GRAPH = new BaseGraph<>(1, TEST_GRAPH_EDGE_FACTORY); - private static final List TEST_DATA; - - - static { - for (int i = 0; i < TEST_EDGE_PAIRS.length; i += 2) { - final int sourceInteger = TEST_EDGE_PAIRS[i]; - final int targetInteger = TEST_EDGE_PAIRS[i + 1]; - final BaseVertex sourceVertex = resolveVertexByInteger(sourceInteger); - final BaseVertex targetVertex = resolveVertexByInteger(targetInteger); - TEST_GRAPH.addEdge(sourceVertex, targetVertex); 
- } - Assert.assertEquals(1,TEST_GRAPH.getSources().size()); - final Deque> pendingPaths = new LinkedList<>(); - final Deque> pendingRoutes = new LinkedList<>(); - final List> allPossiblePaths = new LinkedList<>(); - final List> allPossibleRoutes = new LinkedList<>(); - for (final BaseVertex vertex : TEST_GRAPH.vertexSet()) { - pendingPaths.add(new Path(vertex, TEST_GRAPH)); - pendingRoutes.add(new Route(vertex,TEST_GRAPH)); - } - while (!pendingPaths.isEmpty()) { // !pendingRoutes.isEmpty(); - final Path path = pendingPaths.remove(); - final Route route = pendingRoutes.remove(); - final BaseVertex lastVertex = path.getLastVertex(); - allPossiblePaths.add(path); - allPossibleRoutes.add(route); - - if (allPossiblePaths.size() % 100 == 0) - Reporter.log("" + allPossiblePaths.size(), true); - for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) - pendingPaths.add(new Path<>(path,edge)); - for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) - pendingRoutes.add(new Route<>(route,edge)); - } - - final int numberOfPaths = allPossiblePaths.size(); - final boolean[][] isSuffix = buildIsSuffixMatrix(allPossiblePaths, numberOfPaths); - TEST_DATA = createTestData(allPossiblePaths,allPossibleRoutes,isSuffix); - } - - private static boolean[][] buildIsSuffixMatrix(final List> allPossiblePaths, final int numberOfPaths) { - final boolean[][] isSuffix = new boolean[numberOfPaths][numberOfPaths]; - final ListIterator> iIterator = allPossiblePaths.listIterator(); - for (int i = 0; i < numberOfPaths; i++) { - isSuffix[i][i] = true; - final ListIterator> jIterator = allPossiblePaths.listIterator(i + 1); - final Path iPath = iIterator.next(); - for (int j = i + 1; j < numberOfPaths; j++) { - final Path jPath = jIterator.next(); - if (iPath.getLastVertex() != jPath.getLastVertex()) { - isSuffix[i][j] = isSuffix[j][i] = false; - } else { - isSuffix[i][j] = isSuffix[j][i] = true; // let assume they are suffix of each other by default. 
- final Path shortPath; - final Path longPath; - if (iPath.getEdges().size() <= jPath.getEdges().size()) { - shortPath = iPath; - longPath = jPath; - } else { - longPath = iPath; - shortPath = jPath; - } - final ListIterator longPathEdgesIterator = longPath.getEdges().listIterator(longPath.getEdges().size()); - final ListIterator shortPathEdgesIterator = shortPath.getEdges().listIterator(shortPath.getEdges().size()); - - while (shortPathEdgesIterator.hasPrevious()) { - final BaseEdge shortEdge = shortPathEdgesIterator.previous(); - final BaseEdge longEdge = longPathEdgesIterator.previous(); - if (shortEdge != longEdge) { - isSuffix[i][j] = isSuffix[j][i] = false; - break; - } - } - if (isSuffix[i][j]) { - if (longPathEdgesIterator.hasPrevious()) { - if (longPath == iPath) - isSuffix[j][i] = false; - else - isSuffix[i][j] = false; - } - } - } - - } - } - return isSuffix; - } - - private static List createTestData(final List> allPossiblePaths, final List> allPossibleRoutes, final boolean[][] isSuffix) { - final List result = new ArrayList<>(allPossiblePaths.size() * allPossiblePaths.size() * 2 ); - final Path[] allPaths = allPossiblePaths.toArray(new Path[allPossiblePaths.size()]); - final Route[] allRoutes = allPossibleRoutes.toArray(new Route[allPossibleRoutes.size()]); - final int numberOfPaths = allPaths.length; - for (int i = 0; i < numberOfPaths; i++) - for (int j = 0; j < numberOfPaths; j++) { - result.add(new Object[] { allRoutes[i], allPaths[j], isSuffix[i][j] }); - result.add(new Object[] { allRoutes[i], allRoutes[j], isSuffix[i][j] }); - result.add(new Object[] { allRoutes[i], inverseRebuild(allRoutes[j]), isSuffix[i][j]}); - } - - return result; - } - - private static Route inverseRebuild(final Route original) { - final ListIterator it = original.getEdges().listIterator(original.length()); - Route result = new Route<>(original.getLastVertex(),original.getGraph()); - while (it.hasPrevious()) { - result = new Route<>(it.previous(),result); - } - return 
result; - } - - private static BaseVertex resolveVertexByInteger(final int targetInteger) { - if (vertexByInteger.containsKey(targetInteger)) - return vertexByInteger.get(targetInteger); - else { - int value = targetInteger; - final StringBuffer stringBuffer = new StringBuffer(); - while (value > 0) { - int c = value % 4; - switch (c) { - case 0: stringBuffer.append('A'); break; - case 1: stringBuffer.append('C'); break; - case 2: stringBuffer.append('G'); break; - case 3: stringBuffer.append('T'); break; - } - value = value / 4; - } - if (stringBuffer.length() == 0) stringBuffer.append('A'); - final byte[] sequence = stringBuffer.reverse().toString().getBytes(); - final BaseVertex result = new BaseVertex(sequence); - vertexByInteger.put(targetInteger, result); - TEST_GRAPH.addVertex(result); - return result; - } - - } - - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java deleted file mode 100644 index bbbef43d3..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.indels; - - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -public class PairHMMIndelErrorModelUnitTest extends BaseTest { - - private SAMFileHeader header; - - @BeforeClass - public void setup() throws FileNotFoundException { - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - } - - private static final int refWindowStart = 1000; - private static final int refWindowEnd = 1100; - - @DataProvider(name = "ClipUpstreamProvider") - public Object[][] ClipUpstreamTestData() { - List tests = new ArrayList(); - - for ( final int readStart : Arrays.asList(900, 950, 990, 1000) ) { - 
for ( final int readLength : Arrays.asList(10, 50, 100) ) { - for ( final int delLength : Arrays.asList(0, 5, 10) ) { - tests.add(new Object[]{readStart, readLength, delLength}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ClipUpstreamProvider", enabled = true) - public void clipUpstreamTest(final int readStart, final int readLength, final int delLength) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - if ( delLength == 0 ) - read.setCigarString(readLength + "M"); - else - read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - - final boolean result = PairHMMIndelErrorModel.mustClipUpstream(read, refWindowStart); - Assert.assertEquals(result, read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart); - } - - @DataProvider(name = "ClipDownstreamProvider") - public Object[][] ClipDownstreamTestData() { - List tests = new ArrayList(); - - for ( final int readStart : Arrays.asList(1000, 1050, 1090, 1100) ) { - for ( final int readLength : Arrays.asList(10, 50, 100) ) { - for ( final int delLength : Arrays.asList(0, 5, 10) ) { - tests.add(new Object[]{readStart, readLength, delLength}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ClipDownstreamProvider", enabled = true) - public void clipDownstreamTest(final int readStart, final int readLength, final int delLength) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - if ( delLength == 0 ) - read.setCigarString(readLength + "M"); - else - read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - - final boolean result = PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); - Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd); - } - - @Test - 
public void clipDownstreamAtBorderTest() { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 5, 10); - read.setCigarString("10M"); - Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 13), true); - Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 14), false); - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java deleted file mode 100644 index 9759004a0..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ /dev/null @@ -1,149 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class ReadBackedPhasingIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String reads, String VCF, int cacheWindowSize, int maxPhaseSites, double phaseQualityThresh) { - return "-T ReadBackedPhasing" + - " -R " + reference + - " -I " + validationDataLocation + reads + - " --variant " + ( VCF.contains("phasing_test") ? privateTestDir : validationDataLocation) + VCF + - " --cacheWindowSize " + cacheWindowSize + - " --maxPhaseSites " + maxPhaseSites + - " --phaseQualityThresh " + phaseQualityThresh + - " -o %s" + - " --no_cmdline_in_header"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:332341-382503", - 1, - Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); - executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:1232503-1332503", - 1, - Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); - executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); - } - - @Test - public void test3() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) - + " -L chr20:332341-382503", - 1, - Arrays.asList("f685803333123a102ce1851d984cbd10")); - executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); - } - - @Test - public void test4() { - WalkerTestSpec spec = new WalkerTestSpec( - 
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) - + " -L chr20:332341-382503", - 1, - Arrays.asList("aaa7c25d118383639f273128d241e140")); - executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); - } - - @Test - public void test5() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) - + " -L chr20:332341-482503", - 1, - Arrays.asList("418e29400762972e77bae4f73e16befe")); - executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); - } - - @Test - public void test6() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) - + " -L chr20:652810-681757", - 1, - Arrays.asList("4c8f6190ecc86766baba3aba08542991")); - executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); - } - - @Test - public void test7() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503", - 1, - Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34")); - executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec); - } - - @Test - public void test8() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503" + " -respectPhaseInInput", - 1, - Arrays.asList("e3549b89d49092e73cc6eb21f233471c")); - executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec); - } - -} diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java deleted file mode 100644 index 225000775..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ /dev/null @@ -1,329 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantrecalibration; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.List; - -public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { - private static class VRTest { - String inVCF; - String aggregateVCF; - String tranchesMD5; - String recalMD5; - String cutVCFMD5; - - public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { - this.inVCF = inVCF; - this.tranchesMD5 = tranchesMD5; - this.recalMD5 = recalMD5; - this.cutVCFMD5 = cutVCFMD5; - } - - public VRTest(String inVCF, String aggregateVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) { - this.inVCF = inVCF; - this.aggregateVCF = aggregateVCF; - this.tranchesMD5 = tranchesMD5; - this.recalMD5 = recalMD5; - this.cutVCFMD5 = cutVCFMD5; - } - - @Override - public String toString() { - return "VRTest{inVCF='" + inVCF +"'}"; - } - } - - VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - "6f029dc7d16e63e19c006613cd0a5cff", // tranches - "73c7897441622c9b37376eb4f071c560", // recal file - "11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF - - VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf", - "ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches - "1b33c10be7d8bf8e9accd11113835262", // recal file - "4700d52a06f2ef3a5882719b86911e51"); // cut VCF - - @DataProvider(name = "VRTest") - public Object[][] createData1() { - return new 
Object[][]{ {lowPass} }; - } - - @DataProvider(name = "VRAggregateTest") - public Object[][] createData2() { - return new Object[][]{ {lowPassPlusExomes} }; - } - - @Test(dataProvider = "VRTest") - public void testVariantRecalibrator(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:1,000,000-40,000,000" + - " --no_cmdline_in_header" + - " -an QD -an HaplotypeScore -an HRun" + - " --trustAllPolymorphic" + // for speed - " -recalFile %s" + - " -tranchesFile %s", - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator") - public void testApplyRecalibration(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:12,000,000-30,000,000" + - " --no_cmdline_in_header" + - " -input " + params.inVCF + - " -U LENIENT_VCF_PROCESSING -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles - executeTest("testApplyRecalibration-"+params.inVCF, spec); - } - - @Test(dataProvider = "VRAggregateTest") - public void testVariantRecalibratorAggregate(VRTest params) { - 
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -aggregate " + params.aggregateVCF + - " -L 20:1,000,000-40,000,000" + - " --no_cmdline_in_header" + - " -an QD -an HaplotypeScore -an MQ" + - " --trustAllPolymorphic" + // for speed - " -recalFile %s" + - " -tranchesFile %s", - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibratorAggregate-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRAggregateTest",dependsOnMethods="testVariantRecalibratorAggregate") - public void testApplyRecalibrationAggregate(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:12,000,000-30,000,000" + - " --no_cmdline_in_header" + - " -input " + params.inVCF + - " -U LENIENT_VCF_PROCESSING -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles - executeTest("testApplyRecalibrationAggregate-"+params.inVCF, spec); - } - - VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", - "3ad7f55fb3b072f373cbce0b32b66df4", // tranches - "e747c08131d58d9a4800720f6ca80e0c", // recal file - "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF - - @DataProvider(name = "VRBCFTest") - public Object[][] 
createVRBCFTest() { - return new Object[][]{ {bcfTest} }; - } - - @Test(dataProvider = "VRBCFTest") - public void testVariantRecalibratorWithBCF(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:10,000,000-20,000,000" + - " --no_cmdline_in_header" + - " -an AC " + // integer value - " -an QD -an ReadPosRankSum -an FS -an InbreedingCoeff " + // floats value - " -mG 2 "+ - " -recalFile %s" + - " -tranchesFile %s", - 2, - Arrays.asList("bcf", "txt"), - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRBCFTest", dependsOnMethods="testVariantRecalibratorWithBCF") - public void testApplyRecalibrationWithBCF(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:10,000,000-20,000,000" + - " --no_cmdline_in_header" + - " -input " + params.inVCF + - " -U LENIENT_VCF_PROCESSING -o %s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); - executeTest("testApplyRecalibration-"+params.inVCF, spec); - } - - - VRTest indelUnfiltered = new VRTest( - validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . 
- "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF - - VRTest indelFiltered = new VRTest( - validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS - "9a331328370889168a7aa3a625f73620", // tranches - "2cbbd146d68c40200b782e0226f71976", // recal file - "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF - - @DataProvider(name = "VRIndelTest") - public Object[][] createTestVariantRecalibratorIndel() { - return new Object[][]{ {indelUnfiltered}, {indelFiltered} }; - } - - @Test(dataProvider = "VRIndelTest") - public void testVariantRecalibratorIndel(VRTest params) { - //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + - " -T VariantRecalibrator" + - " -input " + params.inVCF + - " -L 20:1,000,000-40,000,000" + - " --no_cmdline_in_header" + - " -an QD -an ReadPosRankSum -an HaplotypeScore" + - " -mode INDEL -mG 3" + - " --trustAllPolymorphic" + // for speed - " -recalFile %s" + - " -tranchesFile %s", - Arrays.asList(params.recalMD5, params.tranchesMD5)); - executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); - } - - @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") - public void testApplyRecalibrationIndel(VRTest params) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:12,000,000-30,000,000" + - " -mode INDEL" + - " -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" + - " -input " + params.inVCF + - " -o 
%s" + - " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + - " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), - Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation - executeTest("testApplyRecalibrationIndel-" + params.inVCF, spec); - } - - @Test - public void testApplyRecalibrationSnpAndIndelTogether() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:1000100-1000500" + - " -mode BOTH" + - " --no_cmdline_in_header" + - " -input " + privateTestDir + "VQSR.mixedTest.input" + - " -o %s" + - " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + - " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); - executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); - } - - @Test(enabled = true) - public void testApplyRecalibrationSnpAndIndelTogetherExcludeFiltered() throws Exception { - final String base = "-R " + b37KGReference + - " -T ApplyRecalibration" + - " -L 20:1000100-1000500" + - " -mode BOTH" + - " --excludeFiltered -ts_filter_level 90.0" + - " --no_cmdline_in_header" + - " -input " + privateTestDir + "VQSR.mixedTest.input" + - " -o %s" + - " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + - " -recalFile " + privateTestDir + "VQSR.mixedTest.recal"; - - final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); - spec.disableShadowBCF(); - final File VCF = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).first.get(0); - - for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { - if( VC != null ) { - Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file - } - } - } -} - diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java deleted file mode 100644 index 7b546b8db..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java +++ /dev/null @@ -1,72 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class CombineReferenceCalculationVariantsIntegrationTest extends WalkerTest { - - private static String baseTestString(String args, String ref) { - return "-T CombineReferenceCalculationVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; - } - - // TODO -- enable this test (and create others) once the Haplotype Caller produces appropriate gVCFs (with for every record) - @Test(enabled = false) - public void combineSingleSamplePipelineGVCF() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + - " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + - " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + - " -L 20:10,000,000-10,001,000", b37KGReference), - 1, - Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); - executeTest("combineSingleSamplePipelineGVCF", spec); - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java deleted file mode 100644 index a7d32d43b..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ /dev/null @@ -1,77 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -/** - * Tests LeftAlignAndTrimVariants - */ -public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { - - @Test - public void testLeftAlignment() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", - 1, - Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); - executeTest("test left alignment", spec); - } - - @Test - public void testLeftAlignmentWithTrimmingAndMultialleliecs() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", - 1, - Arrays.asList("4ae03954f8bd66e73fd005c49ea301db")); - executeTest("test left alignment with trimming and hard multiple alleles", spec); - - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java deleted file mode 100644 index 884b46692..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ /dev/null @@ -1,347 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class SelectVariantsIntegrationTest extends WalkerTest { - public static String baseTestString(String args) { - return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; - } - - @Test - public void testDiscordanceNoSampleSpecified() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " - + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("954415f84996d27b07d00855e96d33a2") - ); - spec.disableShadowBCF(); - - executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); - } - - @Test - public void testRepeatedLineSelection() { - String testfile = privateTestDir + "test.dup.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant " + testfile), - 1, - Arrays.asList("125d1c9fa111cd38dfa2ff3900f16b57") - ); - - executeTest("testRepeatedLineSelection--" + testfile, spec); - } - - @Test - public void testDiscordance() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " - + b37hapmapGenotypes + " -disc " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43") - ); - spec.disableShadowBCF(); - - executeTest("testDiscordance--" + testFile, spec); - } - - @Test - public void testComplexSelection() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = 
validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), - 1, - Arrays.asList("4386fbb258dcef4437495a37f5a83c53") - ); - spec.disableShadowBCF(); - executeTest("testComplexSelection--" + testfile, spec); - } - - @Test - public void testComplexSelectionWithNonExistingSamples() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), - 1, - Arrays.asList("4386fbb258dcef4437495a37f5a83c53") - ); - spec.disableShadowBCF(); - executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); - } - - @Test - public void testNonExistingFieldSelection() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -env -ef -select 'foo!=0||DP>0' --variant " + testfile), - 1, - Arrays.asList("44e77cea624cfff2b8acc3a4b30485cb") // should yield empty vcf because the foo!=0 will yield complete expression false - ); - spec.disableShadowBCF(); - executeTest("testNonExistingSelection--" + testfile, spec); - } - - @Test - public void testSampleExclusionFromFileAndSeparateSample() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, - 1, - Arrays.asList("1f5c72951a35667c4bdf1be153787e27") - ); - spec.disableShadowBCF(); - 
- executeTest("testSampleExclusion--" + testfile, spec); - } - - @Test - public void testSampleExclusionJustFromFile() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile, - 1, - Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677") - ); - spec.disableShadowBCF(); - - executeTest("testSampleExclusion--" + testfile, spec); - } - - @Test - public void testSampleInclusionWithNonexistingSamples() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, - 1, - UserException.BadInput.class - ); - spec.disableShadowBCF(); - - executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); - } - - - @Test - public void testConcordance() { - String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " - + b37hapmapGenotypes + " --variant " + testFile - + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", - 1, - Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46") - ); - spec.disableShadowBCF(); - - executeTest("testConcordance--" + testFile, spec); - } - - @Test - public void testVariantTypeSelection() { - String testFile = privateTestDir + "complexExample1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s 
--no_cmdline_in_header", - 1, - Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") - ); - - executeTest("testVariantTypeSelection--" + testFile, spec); - } - - @Test - public void testIndelLengthSelection() { - String testFile = privateTestDir + "complexExample1.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", - 1, - Arrays.asList("004589868ca5dc887e2dff876b4cc797") - ); - - executeTest("testIndelLengthSelection--" + testFile, spec); - } - - @Test - public void testUsingDbsnpName() { - String testFile = privateTestDir + "combine.3.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") - ); - - executeTest("testUsingDbsnpName--" + testFile, spec); - } - - @Test - public void testRemoveMLE() { - String testFile = privateTestDir + "vcfexample.withMLE.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("a554459c9ccafb9812ff6d8c06c11726") - ); - - executeTest("testRemoveMLE--" + testFile, spec); - } - - @Test - public void testKeepOriginalAC() { - String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("ad7e8b25e431a3229a78cec063876559") - ); - - executeTest("testKeepOriginalAC--" + testFile, spec); - } - - @Test - public void testKeepOriginalACAndENV() { - String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants 
--keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("e9b8292212545684cdb163423329ee7e") - ); - - executeTest("testKeepOriginalACAndENV--" + testFile, spec); - } - - @Test - public void testMultipleRecordsAtOnePosition() { - String testFile = privateTestDir + "selectVariants.onePosition.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("44f7c47395ca5b2afef5313f592c8cea") - ); - - executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec); - } - - @Test - public void testNoGTs() { - String testFile = privateTestDir + "vcf4.1.example.vcf"; - - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", - 1, - Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542") - ); - - executeTest("testNoGTs--" + testFile, spec); - } - - @Test - public void testSelectFromMultiAllelic() { - String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; - String samplesFile = privateTestDir + "GIH.samples.list"; - WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, - 1, - Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") - ); - executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); - } - - @Test() - public void testFileWithoutInfoLineInHeader() { - testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", IllegalStateException.class); - } - - @Test() - public void testFileWithoutInfoLineInHeaderWithOverride() { - testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null); - } - - private void testFileWithoutInfoLineInHeader(final String name, final 
Class expectedException) { - final String testFile = privateTestDir + "missingHeaderLine.vcf"; - final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " - + testFile + " -o %s --no_cmdline_in_header" - + (expectedException == null ? " -U LENIENT_VCF_PROCESSING" : ""); - WalkerTestSpec spec = - expectedException != null - ? new WalkerTestSpec(cmd, 1, expectedException) - : new WalkerTestSpec(cmd, 1, Arrays.asList("")); - spec.disableShadowBCF(); - - executeTest(name, spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java deleted file mode 100644 index 4d7fa28ad..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ /dev/null @@ -1,105 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class SelectVariantsParallelIntegrationTest extends WalkerTest { - - private class ParallelSelectTestProvider extends TestDataProvider { - final String reference; - final String args; - final String md5; - final int nt; - - private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { - super(ParallelSelectTestProvider.class); - this.reference = reference; - this.args = args; - this.md5 = md5; - this.nt = nt; - } - - public final String getCmdLine() { - return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; - } - - public String toString() { - return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); - } - } - - @DataProvider(name = "ParallelSelectTest") - public Object[][] makeParallelSelectTestProvider() { - for ( int nt : Arrays.asList(1, 2, 4) ) { - { // original MAF test - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; - new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); - } - { // new tests on b37 using testdir VCF - final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; - final String args = "-select 'DP > 30' -V " + testfile; - new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); - } - } - - return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); - } - - @Test(dataProvider = "ParallelSelectTest") - public void 
testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { - final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); - executeTest(cfg.toString(), spec); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java b/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java deleted file mode 100644 index 88f5910f7..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java +++ /dev/null @@ -1,127 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ -package org.broadinstitute.sting.utils; - -import com.sun.istack.internal.NotNull; - -import java.util.Random; - -/** - * Random DNA sequence generator. - * - *

- * Returned bases are always in upper case and one of the valid four nocleotides 'A', 'C', 'G' and 'T'. - *

- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class RandomDNA { - - private Random random; - - /** - * Constructs a new random DNA generator. - * - *

- * The seed would be the default which would depend on system properties and the current time as - * described in {@link Random} documentation. - *

- */ - public RandomDNA() { - random = new Random(); - } - - /** - * Constructs a new random DNA generator providing a seed. - * - * @param seed the random number generator seed. - */ - public RandomDNA(final long seed) { - random = new Random(seed); - } - - /** - * Updates the content of a byte array with a random base sequence. - * - *

- * The whole array will be filled with new base values. - *

- * - * @param destination the array to update. - * - * @throws NullPointerException if {@code destination} is {@code null}. - */ - public void nextBases(final byte[] destination) { - random.nextBytes(destination); - for (int i = 0; i < destination.length; i++) { - final int ord = destination[i] & 0x03; - switch (ord) { - case 0: destination[i] = 'A'; break; - case 1: destination[i] = 'C'; break; - case 2: destination[i] = 'G'; break; - case 3: destination[i] = 'T'; break; - default: throw new IllegalStateException("this cannot be happening!!!"); - } - } - } - - /** - * Returns a random RNA sequence of bases. - * @param size the length of the sequence. - * - * @throws IllegalArgumentException if {@code size} is negative. - * @return never {@code null}. - */ - @NotNull - public byte[] nextBases(final int size) { - if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); - final byte[] result = new byte[size]; - nextBases(result); - return result; - } - - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java deleted file mode 100644 index 5c14c490e..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ /dev/null @@ -1,362 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.gvcf; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class GVCFWriterUnitTest extends BaseTest { - private static class MockWriter implements VariantContextWriter { - final List emitted = new ArrayList<>(); - boolean headerWritten = false; - boolean closed = false; - - @Override - public void writeHeader(VCFHeader header) { - headerWritten = true; - } - - @Override - public void close() { - closed = true; - } - - @Override - public void add(VariantContext vc) { - emitted.add(vc); - } - } - - private MockWriter mockWriter; - private List 
standardPartition = Arrays.asList(1, 10, 20); - private Allele REF = Allele.create("N", true); - private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - private final String SAMPLE_NAME = "XXYYZZ"; - - @BeforeMethod - public void setUp() throws Exception { - mockWriter = new MockWriter(); - } - - @Test - public void testHeaderWriting() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.writeHeader(new VCFHeader()); - Assert.assertTrue(mockWriter.headerWritten); - } - - @Test - public void testClose() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.close(); - Assert.assertTrue(mockWriter.closed); - } - - @Test - public void testCloseWithoutClosingUnderlyingWriter() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - writer.close(false); - Assert.assertFalse(mockWriter.closed); - } - - private VariantContext makeHomRef(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, ALLELES); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeNonRef(final String contig, final int start, final int GQ) { - final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, 
Arrays.asList(REF, ALT)); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); - gb.GQ(GQ); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - private VariantContext makeDeletion(final String contig, final int start, final int size) { - final String del = Utils.dupString("A", size); - final String alt = del.substring(0, 1); - final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(del, alt)); - final VariantContextBuilder vcb = new VariantContextBuilder(vc); - final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(vc.getReference(), vc.getAlternateAllele(0))); - gb.GQ(50); - gb.DP(10); - gb.AD(new int[]{1, 2}); - gb.PL(new int[]{0, 10, 100}); - return vcb.genotypes(gb.make()).make(); - } - - @Test - public void testCloseEmitsLastVariant() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - - writer.close(); - Assert.assertTrue(mockWriter.closed); - Assert.assertEquals(mockWriter.emitted.size(), 1); - } - - @Test - public void testCloseDoesntEmitsLastVariantWhenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeNonRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - - writer.close(); - Assert.assertTrue(mockWriter.closed); - Assert.assertEquals(mockWriter.emitted.size(), 1); - } - - @Test - public void testCrossingContigBoundaryRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeHomRef("21", 3, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - - 
writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false); - } - - @Test - public void testCrossingContigBoundaryNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeNonRef("21", 3, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true); - } - - @Test - public void testCrossingContigBoundaryNonRefThenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeNonRef("20", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 1); - writer.add(makeNonRef("21", 1, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true); - assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true); - } - - private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) { - Assert.assertEquals(vc.getChr(), contig); - Assert.assertEquals(vc.getStart(), start); - Assert.assertEquals(vc.getEnd(), stop); - if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - } else { - Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); - Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); - Assert.assertEquals(vc.getGenotypes().size(), 1); - final Genotype g = vc.getGenotype(SAMPLE_NAME); - 
Assert.assertEquals(g.hasAD(), false); - Assert.assertEquals(g.hasLikelihoods(), true); - Assert.assertEquals(g.hasPL(), true); - Assert.assertEquals(g.getPL().length == 3, true); - Assert.assertEquals(g.hasDP(), true); - Assert.assertEquals(g.hasGQ(), true); - } - } - - @Test - public void testVariantForcesNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 30)); - writer.add(makeHomRef("20", 2, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeNonRef("20", 3, 30)); - writer.add(makeHomRef("20", 4, 30)); - writer.add(makeHomRef("20", 5, 30)); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 3, true); - writer.close(); - assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false); - } - - @Test - public void testEmittingTwoBands() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - Assert.assertEquals(mockWriter.emitted.size(), 0); - writer.add(makeHomRef("20", 3, 50)); - writer.add(makeHomRef("20", 4, 50)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false); - } - - @Test - public void testNonContiguousBlocks() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeHomRef("20", 10, 0)); - writer.add(makeHomRef("20", 11, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 2); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false); - } - - @Test - public void testDeletion() { - final GVCFWriter writer = new 
GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeDeletion("20", 3, 3)); - writer.add(makeHomRef("20", 4, 0)); - writer.add(makeHomRef("20", 5, 0)); - writer.add(makeHomRef("20", 6, 0)); - writer.add(makeHomRef("20", 7, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 3); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true); - assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); - } - - @Test - public void testHomRefAlt() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); - - writer.add(makeHomRef("20", 1, 0)); - writer.add(makeHomRef("20", 2, 0)); - writer.add(makeHomRefAlt("20", 3, 0)); - writer.add(makeHomRef("20", 4, 0)); - writer.add(makeHomRef("20", 5, 0)); - writer.add(makeHomRef("20", 6, 0)); - writer.add(makeHomRef("20", 7, 0)); - writer.close(); - Assert.assertEquals(mockWriter.emitted.size(), 3); - assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); - Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); - Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); - assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); - } - - @DataProvider(name = "BandPartitionData") - public Object[][] makeBandPartitionData() { - List tests = new ArrayList<>(); - - tests.add(new Object[]{null, false}); - tests.add(new Object[]{Collections.emptyList(), false}); - tests.add(new Object[]{Arrays.asList(1), true}); - tests.add(new Object[]{Arrays.asList(1, 10), true}); - tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); - tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); - tests.add(new Object[]{Arrays.asList(-1, 1), false}); - tests.add(new Object[]{Arrays.asList(1, null, 10), false}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BandPartitionData") - public void testMyData(final 
List partitions, final boolean expectedGood) { - try { - GVCFWriter.parsePartitions(partitions); - Assert.assertTrue(expectedGood, "Expected to fail but didn't"); - } catch ( Exception e ) { - Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + e.getMessage()); - } - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java deleted file mode 100644 index 337f23afe..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ /dev/null @@ -1,99 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! 
// -// ********************************************************************************** // - -public class NanoSchedulerIntegrationTest extends WalkerTest { - @DataProvider(name = "NanoSchedulerUGTest") - public Object[][] createNanoSchedulerUGTest() { - List tests = new ArrayList(); - - for ( final int nt : Arrays.asList(1, 2) ) - for ( final int nct : Arrays.asList(1, 2) ) { -// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); -//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "a80925b58735828158491f77ae64998b", nt, nct }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") - private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T UnifiedGenotyper -R " + b37KGReference, - "--no_cmdline_in_header -G", - //"--dbsnp " + b37dbSNP132, - "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", - "-L 20:10,000,000-10,100,000", - "-glm " + glm, - "--contamination_fraction_to_filter 0.0", - "-nt " + nt, - "-nct " + nct, - "-o %s" - ), - 1, - Arrays.asList(md5) - ); - executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); - } - - - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java deleted file mode 100644 index 2d3d680df..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java +++ /dev/null @@ -1,112 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.ContextCovariate; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ContextCovariateUnitTest { - ContextCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ContextCovariate(); - covariate.initialize(RAC); - - } - - @Test(enabled = true) - public void testSimpleContexts() { - GATKSAMRecord read = ReadUtils.createRandomRead(1000); - GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); 
- covariate.recordValues(read, readCovariates); - - verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - } - - public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { - for (int i = 0; i < values.length; i++) - Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); - - } - - public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { - final String bases = stringFrom(read.getReadBases()); - String expectedContext = null; - if (offset - contextSize + 1 >= 0) { - String context = bases.substring(offset - contextSize + 1, offset + 1); - if (!context.contains("N")) - expectedContext = context; - } - return expectedContext; - } - - private static String stringFrom(byte[] array) { - String s = ""; - for (byte value : array) - s += (char) value; - return s; - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java deleted file mode 100644 index ce827065b..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class CycleCovariateUnitTest { - CycleCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new CycleCovariate(); - covariate.initialize(RAC); - } - - @Test(enabled = true) - public void testSimpleCycles() { - short readLength = 10; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, 
readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); - - read.setReadNegativeStrandFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); - - read.setSecondOfPairFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); - - read.setReadNegativeStrandFlag(false); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); - } - - private void verifyCovariateArray(int[][] values, int init, int increment) { - for (short i = 0; i < values.length; i++) { - short actual = Short.decode(covariate.formatKey(values[i][0])); - int expected = init + (increment * i); - Assert.assertEquals(actual, expected); - } - } - - @Test(enabled = true, expectedExceptions={UserException.class}) - public void testMoreThanMaxCycleFails() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } - - @Test(enabled = true) - public void testMaxCyclePasses() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java deleted file mode 100644 index f20d6116b..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java +++ /dev/null @@ -1,137 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Random; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class ReadCovariatesUnitTest { - - @Test(enabled = false) - public void testCovariateGeneration() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final String RGID = "id"; - - ReadGroupCovariate rgCov = new ReadGroupCovariate(); - QualityScoreCovariate qsCov = new QualityScoreCovariate(); - ContextCovariate coCov = new ContextCovariate(); - CycleCovariate cyCov = new CycleCovariate(); - - rgCov.initialize(RAC); - qsCov.initialize(RAC); - coCov.initialize(RAC); - cyCov.initialize(RAC); - - Covariate[] requestedCovariates = new Covariate[4]; - requestedCovariates[0] = rgCov; - requestedCovariates[1] = qsCov; - requestedCovariates[2] = coCov; - requestedCovariates[3] = cyCov; - - final int NUM_READS = 100; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - final String[] readGroups = {"RG1", "RG2", "RGbla"}; - for (int idx = 0; idx < NUM_READS; idx++) { - for (final String rgs : readGroups) { - final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); - rg.setPlatform("illumina"); - read.setReadGroup(rg); - read.setReadNegativeStrandFlag(rnd.nextBoolean()); - final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); - final byte[] iQuals 
= read.getBaseQualities(EventType.BASE_INSERTION); - final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, length); - Assert.assertEquals(rc.getInsertionsKeySet().length, length); - Assert.assertEquals(rc.getDeletionsKeySet().length, length); - - for (int i = 0; i < length; i++) { - // check that read group is always the same - Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); - - // check quality score - Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); - - // check context - Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - - // check cycle - Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); - } - - } - - } - - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java deleted 
file mode 100644 index 0b2df6369..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java +++ /dev/null @@ -1,115 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.ReadGroupCovariate; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ReadGroupCovariateUnitTest { - ReadGroupCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ReadGroupCovariate(); - covariate.initialize(RAC); - } - - @Test(enabled = true) - public void testSingleRecord() { - final String expected = "SAMPLE.1"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setPlatformUnit(expected); - runTest(rg, expected, 
covariate); - } - - @Test(enabled = true) - public void testMissingPlatformUnit() { - final String expected = "MY.7"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testForceReadgroup() { - final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); - forcedRAC.FORCE_READGROUP = "FOO"; - final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); - forcedCovariate.initialize(forcedRAC); - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); - runTest(rg, "FOO", forcedCovariate); - } - - private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { - GATKSAMRecord read = ReadUtils.createRandomRead(10); - read.setReadGroup(rg); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); - - } - - private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { - for (int[] value : values) { - String actual = covariate.formatKey(value[0]); - Assert.assertEquals(actual, expected); - } - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java deleted file mode 100644 index 7d1e51385..000000000 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ /dev/null @@ -1,165 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class RecalibrationReportUnitTest { - private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); - final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); - return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); - } - - @Test(enabled = true) - public void testOutput() { - final int length = 100; - - List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - List counts = new 
ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - - for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { - quals.add((byte) i); - counts.add(1L); - } - - final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - quantizationInfo.noQuantization(); - final List requiredCovariates = new LinkedList(); - final List optionalCovariates = new LinkedList(); - - final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); - rgCovariate.initialize(RAC); - requiredCovariates.add(rgCovariate); - - final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); - qsCovariate.initialize(RAC); - requiredCovariates.add(qsCovariate); - - final ContextCovariate cxCovariate = new ContextCovariate(); - cxCovariate.initialize(RAC); - optionalCovariates.add(cxCovariate); - final CycleCovariate cyCovariate = new CycleCovariate(); - cyCovariate.initialize(RAC); - optionalCovariates.add(cyCovariate); - - final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - int covariateIndex = 0; - for (final Covariate cov : requiredCovariates) - requestedCovariates[covariateIndex++] = cov; - for (final Covariate cov : optionalCovariates) - requestedCovariates[covariateIndex++] = cov; - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); - rg.setPlatform("illumina"); - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - read.setReadGroup(rg); - final byte [] readQuals = new byte[length]; - for (int i = 0; i < length; i++) - readQuals[i] = 20; - read.setBaseQualities(readQuals); - - final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); - int nKeys = 0; // keep track of how many keys were produced - final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - final RecalibrationTables recalibrationTables = 
new RecalibrationTables(requestedCovariates); - final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - - for (int offset = 0; offset < length; offset++) { - - for (EventType errorMode : EventType.values()) { - - final int[] covariates = rc.getKeySet(offset, errorMode); - final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); - nKeys += 2; - for (int j = 0; j < optionalCovariates.size(); j++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); - final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; - if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); - nKeys++; - } - } - } - } - Assert.assertEquals(nKeys, expectedKeys); - } - - private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { - final int numCovariates = 4; - final int numTables = 3; - final int mismatchContextPadding = mismatchesContextSize - 1; - final int indelContextPadding = 2 * (indelContextSize - 1); - final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); - - return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; - } - -} diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java deleted file mode 100644 index ea70deeea..000000000 --- 
a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java +++ /dev/null @@ -1,239 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.*; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Random; - -public class RepeatCovariatesUnitTest { - - RepeatLengthCovariate rlCovariate; - RepeatUnitCovariate ruCovariate; - RepeatUnitAndLengthCovariate rurlCovariate; - RecalibrationArgumentCollection RAC; - - - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - rlCovariate = new 
RepeatLengthCovariate(); - ruCovariate = new RepeatUnitCovariate(); - rurlCovariate = new RepeatUnitAndLengthCovariate(); - rlCovariate.initialize(RAC); - ruCovariate.initialize(RAC); - rurlCovariate.initialize(RAC); - } - - - @Test(enabled = true) - public void testFindNumberOfRepetitions() { - // First, test logic to compute number of repetitions of a substring on a given string. - int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); - Assert.assertEquals(1,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); - Assert.assertEquals(0,result); - // Same tests but looking backward on string - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); - 
Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); - Assert.assertEquals(3,result); - - // test logic to get repeat unit and number of repeats from covariate value - final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; - for (String ru : repUnits) { - for (int k=1; k < 10; k++) { - Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); - Assert.assertEquals(pair.second.intValue(),k); - Assert.assertEquals(pair.first,ru); - } - } - - } - - /** - * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if - * they match with read context - */ - @Test(enabled = true) - public void testManyObservations() { - final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; - final int NUM_TEST_CASES = 100; - - Random random = new Random(); - - for (int r = 0; r < NUM_TEST_CASES; r++) { - final StringBuilder sb = new StringBuilder(); - // for each unit, generate a repeat unit at random with given random length - final ArrayList repeatUnits = new ArrayList(); - final ArrayList numsRepetitions = new ArrayList(); - for (int n=0; n < NUM_UNITS; n++) { - final int repLength = 
1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); - final String repeatUnit = getRandomBases(repLength); - final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); - - // log for comparison with covariate - numsRepetitions.add(numRepetitions); - repeatUnits.add(repeatUnit); - - for (int k=0; k < numRepetitions; k++) - sb.append(repeatUnit); - - } - - final String readBases = sb.toString(); - System.out.println(readBases); - final int readLength = readBases.length(); - - final byte[] readQuals = new byte[readLength]; - Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); - - Covariate[] requestedCovariates = new Covariate[3]; - requestedCovariates[0] = rlCovariate; - requestedCovariates[1] = ruCovariate; - requestedCovariates[2] = rurlCovariate; - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); - Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); - Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - - for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - // check RepeatLength - final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); - final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); - final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); - // check RepeatUnit - final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); - final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); - final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); - // check RepeatUnitAndLength - final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); - final String rurlValI = 
rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); - final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); - // check all 3 values are identical - Assert.assertEquals(rlValD,rlValI); - Assert.assertEquals(rlValM,rlValI); - Assert.assertEquals(ruValD,ruValI); - Assert.assertEquals(ruValM,ruValI); - Assert.assertEquals(rurlValD,rurlValI); - Assert.assertEquals(rurlValM,rurlValI); - - - int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); - int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); - Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); - } - - } - - - - - - - } - - /** - * Returns random bases of given length - * @param length required length - * @return given random string - */ - @Requires("length > 0") - String getRandomBases(final int length) { - byte[] bases = new byte[length]; - Random ran = new Random(); - for (int i=0; i < length; i++ ) { - int idx = ran.nextInt(4); - bases[i] = BaseUtils.baseIndexToSimpleBase(idx); - } - return new String(bases); - } - - -} diff --git a/protected/pom.xml b/protected/pom.xml new file mode 100644 index 000000000..4b165d477 --- /dev/null +++ b/protected/pom.xml @@ -0,0 +1,24 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT + ../public/sting-root + + + sting-protected + pom + Sting Protected + + + gatk-protected + + + + ${project.basedir}/.. + + + diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml new file mode 100644 index 000000000..bdfeb099f --- /dev/null +++ b/public/external-example/pom.xml @@ -0,0 +1,267 @@ + + 4.0.0 + + + org.mycompany.app + external-example + jar + 1.0-SNAPSHOT + GATK External Example + + + 2.8-SNAPSHOT + + ../.. 
+ UTF-8 + UTF-8 + yyyy/MM/dd HH:mm:ss + + + true + ${sting.committests.skipped} + ${sting.committests.skipped} + + + package + + + + + sting.public.repo.local + Sting Public Local Repository + file:${sting.basedir}/public/repo + + + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + test-jar + test + + + + org.testng + testng + 6.8 + test + + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + unpack + process-resources + + unpack + + + + + org.broadinstitute.sting + gatk-framework + ${sting.version} + example-resources + tar.bz2 + ${project.build.outputDirectory} + + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + extract-resource-bundle + + javadoc + + prepare-package + + org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet + + ${project.build.outputDirectory} + + org.broadinstitute.sting + + gatk-framework + ${sting.version} + + 2g + false + true + -build-timestamp "${maven.build.timestamp}" -absolute-version "${project.version}" -out ${project.build.outputDirectory}/StingText.properties + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.1 + + + ${sting.shade.phase} + + shade + + + true + + + + commons-logging:commons-logging + + ** + + + + org.broad:tribble + + ** + + + + org.broadinstitute:variant + + ** + + + + + + + org.broadinstitute.sting:gsalib:tar.gz:* + org.broadinstitute.sting:*:tar.bz2:example-resources + + + + + + org.broadinstitute.sting.gatk.CommandLineGATK + + + + StingText.properties + + + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.16 + + + true + false + + + ${java.io.tmpdir} + + + + + + default-test + none + + + unit-tests + + test + + + ${sting.unittests.skipped} + + **/*UnitTest.class + + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.16 + + + true + false + + + ${java.io.tmpdir} + + + + + integration-tests + + integration-test + 
verify + + + + ${sting.integrationtests.skipped} + + **/*IntegrationTest.class + + + + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + + + + + diff --git a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java new file mode 100644 index 000000000..d65c47c99 --- /dev/null +++ b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; + +import java.io.PrintStream; + +/** + * An example walker that looks surprisingly like CountLoci. + */ +public class MyExampleWalker extends LocusWalker { + @Output + PrintStream out; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 1; + } + + public Long reduceInit() { return 0l; } + + public Long reduce(Integer value, Long sum) { + return value + sum; + } + + public void onTraversalDone( Long c ) { + out.println(c); + } +} diff --git a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java new file mode 100644 index 000000000..777079426 --- /dev/null +++ b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.Collections; +import java.util.MissingResourceException; + +/** + * NOTE: Currently the testing infrastructure for walkers does not support running outside the Broad. + */ +public class MyExampleWalkerIntegrationTest extends WalkerTest { + @Test + public void testMyExampleWalker() throws URISyntaxException { + String gatk_args = String.format("-T MyExampleWalker -I %s -R %s", getResource("/exampleBAM.bam"), getResource("/exampleFASTA.fasta")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, Collections.emptyList()); + executeTest("Testing count on the example bam", spec); + } + + private File getResource(String path) throws URISyntaxException { + URL resourceUrl = getClass().getResource(path); + if (resourceUrl == null) + throw new MissingResourceException("Resource not found: " + path, getClass().getSimpleName(), path); + return new File(resourceUrl.toURI()); + } +} diff --git a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java new file mode 100644 index 000000000..e3e0c81ea --- /dev/null +++ b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerUnitTest.java @@ -0,0 +1,41 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* 
Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.mycompany.app; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * NOTE: Currently the testing infrastructure for walkers does not support running outside the Broad. + */ +public class MyExampleWalkerUnitTest extends BaseTest { + @Test + public void testMyExampleWalker() { + MyExampleWalker walker = new MyExampleWalker(); + Assert.assertEquals((long)walker.reduce(1, 1L), 2L); + } +} diff --git a/public/gatk-framework/pom.xml b/public/gatk-framework/pom.xml new file mode 100644 index 000000000..6b82c8618 --- /dev/null +++ b/public/gatk-framework/pom.xml @@ -0,0 +1,126 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + gatk-framework + jar + GATK Framework + + + ${project.basedir}/../.. 
+ gatk-package + + + + + ${project.groupId} + sting-utils + ${project.version} + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + ${sting.generate-resources.phase} + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + + + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + + diff --git a/public/gatk-framework/src/main/assembly/example-resources.xml b/public/gatk-framework/src/main/assembly/example-resources.xml new file mode 100644 index 000000000..b285cc05f --- /dev/null +++ b/public/gatk-framework/src/main/assembly/example-resources.xml @@ -0,0 +1,37 @@ + + example-resources + + tar.bz2 + + false + + + ${project.build.sourceDirectory}/org/broadinstitute/sting/gatk/walkers/qc + . + + Pileup.java + CountLoci.java + CountReads.java + CheckPileup.java + + + + ${project.build.sourceDirectory}/org/broadinstitute/sting/gatk/walkers/readutils + . + + PrintReads.java + + + + src/test/resources + . 
+ + exampleBAM.bam + exampleBAM.bam.bai + exampleFASTA.fasta + exampleFASTA.fasta.fai + exampleFASTA.dict + + + + diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKBAMFileSpan.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKBAMFileSpan.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKBAMFileSpan.java diff --git a/public/java/src/net/sf/samtools/GATKBin.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKBin.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKBin.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKBin.java diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/gatk-framework/src/main/java/net/sf/samtools/GATKChunk.java similarity index 100% rename from public/java/src/net/sf/samtools/GATKChunk.java rename to public/gatk-framework/src/main/java/net/sf/samtools/GATKChunk.java diff --git a/public/java/src/net/sf/samtools/PicardNamespaceUtils.java b/public/gatk-framework/src/main/java/net/sf/samtools/PicardNamespaceUtils.java similarity index 100% rename from public/java/src/net/sf/samtools/PicardNamespaceUtils.java rename to public/gatk-framework/src/main/java/net/sf/samtools/PicardNamespaceUtils.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/Aligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Aligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/Aligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Aligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/Alignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Alignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/Alignment.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/Alignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/CheckAlignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/CheckAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWTFiles.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/BWTFiles.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java similarity index 100% rename 
from public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWT.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWT.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Bases.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Bases.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Counts.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/Counts.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Advanced.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Advanced.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Advanced.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Advanced.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Argument.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Argument.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Argument.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinition.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinition.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitionGroup.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitions.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentDefinitions.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentException.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentIOType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentIOType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentIOType.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatch.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatch.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSite.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSite.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSource.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatchValue.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatches.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentMatches.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentSource.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java new file mode 100644 index 000000000..8f0abe360 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -0,0 +1,978 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to 
use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.Multiplex; +import org.broadinstitute.sting.gatk.walkers.Multiplexer; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.*; +import java.util.*; + +/** + * An descriptor capable of providing parsers that can parse any type + * of supported command-line argument. 
+ * + * @author mhanna + * @version 0.1 + */ +public abstract class ArgumentTypeDescriptor { + private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; + + /** + * our log, which we want to capture anything from org.broadinstitute.sting + */ + protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); + + /** + * Fetch the given descriptor from the descriptor repository. + * @param descriptors the descriptors from which to select a good match. + * @param type Class for which to specify a descriptor. + * @return descriptor for the given type. + */ + public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { + for( ArgumentTypeDescriptor descriptor: descriptors ) { + if( descriptor.supports(type) ) + return descriptor; + } + throw new ReviewedStingException("Can't process command-line arguments of type: " + type.getName()); + } + + /** + * Does this descriptor support classes of the given type? + * @param type The type to check. + * @return true if this descriptor supports the given type, false otherwise. + */ + public abstract boolean supports( Class type ); + + /** + * Returns false if a type-specific default can be employed. + * @param source Source of the command-line argument. + * @return True to throw in a type specific default. False otherwise. + */ + public boolean createsTypeDefault(ArgumentSource source) { return false; } + + /** + * Returns a documentation-friendly value for the default of a type descriptor. + * Must be overridden if createsTypeDefault return true. cannot be called otherwise + * @param source Source of the command-line argument. + * @return Friendly string of the default value, for documentation. If doesn't create a default, throws + * and UnsupportedOperationException + */ + public String typeDefaultDocString(ArgumentSource source) { + throw new UnsupportedOperationException(); + } + + /** + * Generates a default for the given type. 
+ * + * @param parsingEngine the parsing engine used to validate this argument type descriptor. + * @param source Source of the command-line argument. + * @param type Type of value to create, in case the command-line argument system wants influence. + * @return A default value for the given type. + */ + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } + + /** + * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. + * @param source Source class and field for the given argument. + * @return A list of command-line argument definitions supporting this field. + */ + public List createArgumentDefinitions( ArgumentSource source ) { + return Collections.singletonList(createDefaultArgumentDefinition(source)); + } + + /** + * Parses an argument source to an object. + * WARNING! Mandatory side effect of parsing! Each parse routine should register the tags it finds with the proper CommandLineProgram. + * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. + * + * @param parsingEngine The engine responsible for parsing. + * @param source The source used to find the matches. + * @param matches The matches for the source. + * @return The parsed object. + */ + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { + return parse(parsingEngine, source, source.field.getGenericType(), matches); + } + + /** + * Returns true if the field is a collection or an array. + * @param source The argument source to check. + * @return true if the field is a collection or an array. 
+ */ + public boolean isMultiValued( ArgumentSource source ) { + Class argumentType = source.field.getType(); + return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); + } + + /** + * By default, argument sources create argument definitions with a set of default values. + * Use this method to create the one simple argument definition. + * @param source argument source for which to create a default definition. + * @return The default definition for this argument source. + */ + protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { + Annotation argumentAnnotation = getArgumentAnnotation(source); + return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), + source.field.getType(), + ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), + ArgumentDefinition.getShortName(argumentAnnotation), + ArgumentDefinition.getDoc(argumentAnnotation), + source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), + source.isFlag(), + source.isMultiValued(), + source.isHidden(), + makeRawTypeIfNecessary(getCollectionComponentType(source.field)), + ArgumentDefinition.getExclusiveOf(argumentAnnotation), + ArgumentDefinition.getValidationRegex(argumentAnnotation), + getValidOptions(source) ); + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + protected Type getCollectionComponentType( Field field ) { + return null; + } + + /** + * Parses the argument matches for a class type into an object. + * @param source The original argument source used to find the matches. + * @param type The current class type being inspected. 
May not match the argument source.field.getType() if this as a collection for example. + * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. + * @return The individual parsed object matching the argument match with Class type. + */ + public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); + + /** + * If the argument source only accepts a small set of options, populate the returned list with + * those options. Otherwise, leave the list empty. + * @param source Original field specifying command-line arguments. + * @return A list of valid options. + */ + protected List getValidOptions( ArgumentSource source ) { + if(!source.field.getType().isEnum()) + return null; + List validOptions = new ArrayList(); + for(Object constant: source.field.getType().getEnumConstants()) + validOptions.add(constant.toString()); + return validOptions; + } + + /** + * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return true if the argument is present, or false if not present. + */ + protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + return true; + } + return false; + } + + /** + * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. + * If the argument matches multiple values, an exception will be thrown. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or null if not present. 
+ */ + protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { + Collection argumentValues = getArgumentValues( definition, matches ); + if( argumentValues.size() > 1 ) + throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); + return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; + } + + /** + * Gets the tags associated with a given command-line argument. + * If the argument matches multiple values, an exception will be thrown. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or null if not present. + */ + protected Tags getArgumentTags(ArgumentMatches matches) { + Tags tags = new Tags(); + for(ArgumentMatch match: matches) { + if(!tags.isEmpty() && !match.tags.isEmpty()) + throw new ReviewedStingException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict."); + tags = match.tags; + } + return tags; + } + + /** + * Gets the values of an argument with the given full name, from the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or an empty collection if not present. + */ + protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { + Collection values = new ArrayList(); + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + values.addAll(match.values()); + } + return values; + } + + /** + * Retrieves the argument description from the given argument source. Will throw an exception if + * the given ArgumentSource + * @param source source of the argument. + * @return Argument description annotation associated with the given field. 
+ */ + @SuppressWarnings("unchecked") + protected static Annotation getArgumentAnnotation( ArgumentSource source ) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (source.field.isAnnotationPresent(annotation)) + return source.field.getAnnotation(annotation); + throw new ReviewedStingException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); + } + + /** + * Returns true if an argument annotation is present + * @param field The field to check for an annotation. + * @return True if an argument annotation is present on the field. + */ + @SuppressWarnings("unchecked") + public static boolean isArgumentAnnotationPresent(Field field) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (field.isAnnotationPresent(annotation)) + return true; + return false; + } + + /** + * Returns true if the given annotation is hidden from the help system. + * @param field Field to test. + * @return True if argument should be hidden. False otherwise. + */ + public static boolean isArgumentHidden(Field field) { + return field.isAnnotationPresent(Hidden.class); + } + + public static Class makeRawTypeIfNecessary(Type t) { + if ( t == null ) + return null; + else if ( t instanceof ParameterizedType ) + return (Class)((ParameterizedType) t).getRawType(); + else if ( t instanceof Class ) { + return (Class)t; + } else { + throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); + } + } + + /** + * The actual argument parsing method. + * @param source source + * @param type type to check + * @param matches matches + * @param tags argument tags + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
+ */ + protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + return parseBinding(value, parameterType, type, name, tags, source.field.getName()); + } + + /** + * + * @param value The source of the binding + * @param parameterType The Tribble Feature parameter type + * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. + * @param bindingName The name of the binding passed to the constructor. + * @param tags Tags for the binding used for parsing and passed to the constructor. + * @param fieldName The name of the field that was parsed. Used for error reporting. + * @return The newly created binding object of type bindingClass. + */ + public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, + String bindingName, Tags tags, String fieldName) { + try { + String tribbleType = null; + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. 
" + + "Rod bindings only support -X:type and -X:name,type argument styles", + value.asString(), fieldName)); + } else if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + bindingName = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + + FeatureManager manager = new FeatureManager(); + if ( manager.getByName(tribbleType) == null ) + throw new UserException.UnknownTribbleType( + tribbleType, + String.format("Unable to find tribble type '%s' provided on the command line. " + + "Please select a correct type from among the supported types:%n%s", + tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); + + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + bindingName = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = value.asFile(); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.debug("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) { + // IntervalBinding can be created from a normal String + Class rawType = (makeRawTypeIfNecessary(bindingClass)); + try { + return rawType.getConstructor(String.class).newInstance(value.asString()); + } catch (NoSuchMethodException e) { + /* ignore */ + } + + if ( ! file.exists() ) { + throw new UserException.CouldNotReadInputFile(file, "file does not exist"); + } else if ( ! file.canRead() || ! 
file.isFile() ) { + throw new UserException.CouldNotReadInputFile(file, "file could not be read"); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); + } catch (Exception e) { + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. Message: %s", + value, fieldName, e.getMessage())); + } + } + + /** + * Parse the source of a RodBindingCollection, which can be either a file of RodBindings or an actual RodBinding. 
+ * + * @param parsingEngine the parsing engine used to validate this argument type descriptor + * @param source source + * @param type type + * @param matches matches + * @param tags argument tags + * @return the newly created binding object + */ + public Object parseRodBindingCollectionSource(final ParsingEngine parsingEngine, + final ArgumentSource source, + final Type type, + final ArgumentMatches matches, + final Tags tags) { + + final ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + final ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + // if this a list of files, get those bindings + final File file = value.asFile(); + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getRodBindingsCollection(file, parsingEngine, parameterType, name, tags, source.field.getName()); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + + // otherwise, treat this as an individual binding + final RodBinding binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, name, tags, source.field.getName()); + parsingEngine.addTags(binding, tags); + parsingEngine.addRodBinding(binding); + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, Arrays.asList(binding)); + } + + /** + * Retrieve and parse a collection of RodBindings from the given file. + * + * @param file the source file + * @param parsingEngine the engine responsible for parsing + * @param parameterType the Tribble Feature parameter type + * @param bindingName the name of the binding passed to the constructor. + * @param defaultTags general tags for the binding used for parsing and passed to the constructor. + * @param fieldName the name of the field that was parsed. Used for error reporting. 
+ * @return the newly created collection of binding objects. + */ + public static Object getRodBindingsCollection(final File file, + final ParsingEngine parsingEngine, + final Class parameterType, + final String bindingName, + final Tags defaultTags, + final String fieldName) throws IOException { + final List bindings = new ArrayList<>(); + + // parse each line separately using the given Tags if none are provided on each line + for ( final String line: FileUtils.readLines(file) ) { + final String[] tokens = line.split("\\s+"); + final RodBinding binding; + + if ( tokens.length == 0 ) { + continue; // empty line, so do nothing + } + // use the default tags if none are provided for this binding + else if ( tokens.length == 1 ) { + final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[0]); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName); + parsingEngine.addTags(binding, defaultTags); + } + // use the new tags if provided + else if ( tokens.length == 2 ) { + final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]); + final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[1]); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName); + parsingEngine.addTags(binding, tags); + } else { + throw new UserException.BadArgumentValue(fieldName, "data lines should consist of an optional set of tags along with a path to a file; too many tokens are present for line: " + line); + } + + bindings.add(binding); + parsingEngine.addRodBinding(binding); + } + + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings); + } +} + +/** + * Parser for RodBinding objects + */ +class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBinding class objects + * @param type The type to check. 
+ * @return true if the provided class is a RodBinding.class + */ + @Override + public boolean supports( Class type ) { + return isRodBinding(type); + } + + public static boolean isRodBinding( Class type ) { + return RodBinding.class.isAssignableFrom(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { return ! source.isRequired(); } + + @Override + @SuppressWarnings("unchecked") + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + return RodBinding.makeUnbound((Class)parameterType); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "none"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + Tags tags = getArgumentTags(matches); + RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + return rbind; + } +} + +/** + * Parser for IntervalBinding objects + */ +class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want IntervalBinding class objects + * @param type The type to check. + * @return true if the provided class is an IntervalBinding.class + */ + @Override + public boolean supports( Class type ) { + return isIntervalBinding(type); + } + + public static boolean isIntervalBinding( Class type ) { + return IntervalBinding.class.isAssignableFrom(type); + } + + /** + * See note from RodBindingArgumentTypeDescriptor.parse(). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @return the IntervalBinding object. 
+ */
+ @Override
+ public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
+ return parseBinding(source, type, matches, getArgumentTags(matches));
+ }
+}
+
+/**
+ * Parser for RodBindingCollection objects
+ */
+class RodBindingCollectionArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+ /**
+ * We only want RodBindingCollection class objects
+ * @param type The type to check.
+ * @return true if the provided class is a RodBindingCollection.class
+ */
+ @Override
+ public boolean supports( final Class type ) {
+ return isRodBindingCollection(type);
+ }
+
+ public static boolean isRodBindingCollection( final Class type ) {
+ return RodBindingCollection.class.isAssignableFrom(type);
+ }
+
+ /**
+ * See note from RodBindingArgumentTypeDescriptor.parse().
+ *
+ * @param parsingEngine parsing engine
+ * @param source source
+ * @param type type to check
+ * @param matches matches
+ * @return the RodBindingCollection object.
+ */
+ @Override
+ public Object parse(final ParsingEngine parsingEngine, final ArgumentSource source, final Type type, final ArgumentMatches matches) {
+ final Tags tags = getArgumentTags(matches);
+ return parseRodBindingCollectionSource(parsingEngine, source, type, matches, tags);
+ }
+}
+
+/**
+ * Parse simple argument types: java primitives, wrapper classes, and anything that has
+ * a simple String constructor. 
+ */ +class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * @param type the class type + * @return true if this class is a binding type, false otherwise + */ + private boolean isBinding(final Class type) { + return RodBindingArgumentTypeDescriptor.isRodBinding(type) || + IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) || + RodBindingCollectionArgumentTypeDescriptor.isRodBindingCollection(type); + } + + + @Override + public boolean supports( Class type ) { + if ( isBinding(type) ) return false; + if ( type.isPrimitive() ) return true; + if ( type.isEnum() ) return true; + if ( primitiveToWrapperMap.containsValue(type) ) return true; + + try { + type.getConstructor(String.class); + return true; + } + catch( Exception ex ) { + // An exception thrown above means that the String constructor either doesn't + // exist or can't be accessed. In either case, this descriptor doesn't support this type. + return false; + } + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + if (source.isFlag()) + return true; + + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + Object result; + Tags tags = getArgumentTags(matches); + + // lets go through the types we support + try { + if (type.isPrimitive()) { + Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); + if(value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + result = valueOf.invoke(null,value.asString().trim()); + } else if (type.isEnum()) { + Object[] vals = type.getEnumConstants(); + Object defaultEnumeration = null; // as we look at options, record the default option if it exists + for (Object val : vals) { + if (String.valueOf(val).equalsIgnoreCase(value == null ? 
null : value.asString())) return val;
+ try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
+ catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + " doesn't contain the field " + val.toString()); }
+ }
+ // if their argument has no value (null), and there's a default, return that default for the enum value
+ if (defaultEnumeration != null && value == null)
+ result = defaultEnumeration;
+ // if their argument has no value and there's no default, throw a missing argument value exception.
+ // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U.
+ else if (value == null)
+ throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
+ else
+ throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
+ } else if (type.equals(File.class)) {
+ result = value == null ? null : value.asFile();
+ } else {
+ Constructor ctor = type.getConstructor(String.class);
+ result = ctor.newInstance(value == null ? null : value.asString());
+ }
+ } catch (UserException e) {
+ throw e;
+ } catch (InvocationTargetException e) {
+ throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)",
+ value, source.field.getName()));
+ } catch (Exception e) {
+ throw new DynamicClassResolutionException(String.class, e);
+ }
+
+ // TODO FIXME!
+
+ // WARNING: Side effect!
+ parsingEngine.addTags(result,tags);
+
+ return result;
+ }
+
+
+ /**
+ * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer
+ * this association available in the JRE? 
+ */ + private static Map primitiveToWrapperMap = new HashMap() { + { + put( Boolean.TYPE, Boolean.class ); + put( Character.TYPE, Character.class ); + put( Byte.TYPE, Byte.class ); + put( Short.TYPE, Short.class ); + put( Integer.TYPE, Integer.class ); + put( Long.TYPE, Long.class ); + put( Float.TYPE, Float.class ); + put( Double.TYPE, Double.class ); + } + }; +} + +/** + * Process compound argument types: arrays, and typed and untyped collections. + */ +class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { + @Override + public boolean supports( Class type ) { + return ( Collection.class.isAssignableFrom(type) || type.isArray() ); + } + + @Override + @SuppressWarnings("unchecked") + public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + Type componentType; + Object result; + + if( Collection.class.isAssignableFrom(type) ) { + + // If this is a generic interface, pick a concrete implementation to create and pass back. + // Because of type erasure, don't worry about creating one of exactly the correct type. 
+ if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) + { + if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; + else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; + else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; + } + + componentType = getCollectionComponentType( source.field ); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + Collection collection; + try { + collection = (Collection)type.newInstance(); + } + catch (InstantiationException e) { + logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); + throw new ReviewedStingException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); + } + catch (IllegalAccessException e) { + logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); + throw new ReviewedStingException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); + } + + for( ArgumentMatch match: matches ) { + for( ArgumentMatch value: match ) { + Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); + collection.add( object ); + // WARNING: Side effect! + parsingEngine.addTags(object,value.tags); + } + } + + result = collection; + + } + else if( type.isArray() ) { + componentType = type.getComponentType(); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + // Assemble a collection of individual values used in this computation. 
+ Collection values = new ArrayList();
+ for( ArgumentMatch match: matches )
+ for( ArgumentMatch value: match )
+ values.add(value);
+
+ result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size());
+
+ int i = 0;
+ for( ArgumentMatch value: values ) {
+ Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value));
+ Array.set(result,i++,object);
+ // WARNING: Side effect!
+ parsingEngine.addTags(object,value.tags);
+ }
+ }
+ else
+ throw new ReviewedStingException("Unsupported compound argument type: " + type);
+
+ return result;
+ }
+
+ /**
+ * Return the component type of a field, or String.class if the type cannot be found.
+ * @param field The reflected field to inspect.
+ * @return The parameterized component type, or String.class if the parameterized type could not be found.
+ * @throws IllegalArgumentException If more than one parameterized type is found on the field.
+ */
+ @Override
+ protected Type getCollectionComponentType( Field field ) {
+ // If this is a parameterized collection, find the contained type. It blows up if more than one type exists.
+ if( field.getGenericType() instanceof ParameterizedType) {
+ ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType();
+ if( parameterizedType.getActualTypeArguments().length > 1 )
+ throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString());
+ return parameterizedType.getActualTypeArguments()[0];
+ }
+ else
+ return String.class;
+ }
+}
+
+class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+ /**
+ * The multiplexer controlling how data is split.
+ */
+ private final Multiplexer multiplexer;
+
+ /**
+ * The set of identifiers for the multiplexed entries. 
+ */ + private final Collection multiplexedIds; + + public MultiplexArgumentTypeDescriptor() { + this.multiplexer = null; + this.multiplexedIds = null; + } + + /** + * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the + * given set of multiplexed ids. + * @param multiplexedIds The collection of multiplexed entries + */ + private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { + this.multiplexer = multiplexer; + this.multiplexedIds = multiplexedIds; + } + + @Override + public boolean supports( Class type ) { + return ( Map.class.isAssignableFrom(type) ); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { + // Multiplexing always creates a type default. + return true; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { + if(multiplexer == null || multiplexedIds == null) + throw new ReviewedStingException("No multiplexed ids available"); + + Map multiplexedMapping = new HashMap(); + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); + + for(Object id: multiplexedIds) { + Object value = null; + if(componentTypeDescriptor.createsTypeDefault(source)) + value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); + multiplexedMapping.put(id,value); + } + return multiplexedMapping; + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "None"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + if(multiplexedIds == null) + throw new ReviewedStingException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); + + Map multiplexedMapping = new 
HashMap(); + + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + + + for(Object id: multiplexedIds) { + Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); + multiplexedMapping.put(id,value); + } + + parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); + + return multiplexedMapping; + } + + public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { + String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); + + List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); + Class[] sourceTypes = new Class[sourceFields.length]; + Object[] sourceValues = new Object[sourceFields.length]; + int currentField = 0; + + for(String sourceField: sourceFields) { + boolean fieldFound = false; + for(ArgumentSource source: allSources) { + if(!source.field.getName().equals(sourceField)) + continue; + if(source.field.isAnnotationPresent(Multiplex.class)) + throw new ReviewedStingException("Command-line arguments can only depend on independent fields"); + sourceTypes[currentField] = source.field.getType(); + sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); + currentField++; + fieldFound = true; + } + if(!fieldFound) + throw new ReviewedStingException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); + } + + Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); + Constructor multiplexerConstructor; + try { + multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); + multiplexerConstructor.setAccessible(true); + } + catch(NoSuchMethodException ex) { + throw new ReviewedStingException(String.format("Unable to find constructor for 
class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + Multiplexer multiplexer; + try { + multiplexer = multiplexerConstructor.newInstance(sourceValues); + } + catch(IllegalAccessException ex) { + throw new ReviewedStingException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InstantiationException ex) { + throw new ReviewedStingException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InvocationTargetException ex) { + throw new ReviewedStingException(String.format("Can't invoke constructor of class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + @Override + protected Type getCollectionComponentType( Field field ) { + // Multiplex arguments must resolve to maps from which the clp should extract the second type. 
+ if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length != 2 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return (Class)parameterizedType.getActualTypeArguments()[1]; + } + else + return String.class; + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ClassType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ClassType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ClassType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ClassType.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java new file mode 100644 index 000000000..8c7e11f35 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -0,0 +1,447 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.help.HelpFormatter; + +import java.io.IOException; +import java.util.*; + +public abstract class CommandLineProgram { + + /** The command-line program and the arguments it returned. */ + public ParsingEngine parser = null; + + /** + * Setting INFO gets you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging, and so on. + */ + @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false) + protected String logging_level = "INFO"; + + /** + * File to save the logging output. + */ + @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false) + protected String toFile = null; + + /** + * This will produce a help message in the terminal with general usage information, listing available arguments + * as well as tool-specific information if applicable. 
+ */ + @Argument(fullName = "help", shortName = "h", doc = "Generate the help message", required = false) + public Boolean help = false; + + /** + * Use this to check the version number of the GATK executable you are invoking. Note that the version number is + * always included in the output at the start of every run as well as any error message. + */ + @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) + public Boolean version = false; + + + /** our logging output patterns */ + private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; + + static { + /** + * The very first thing that any Sting application does is forces the JVM locale into US English, so that we don't have + * to think about number formatting issues. + */ + forceJVMLocaleToUSEnglish(); + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + } + + + /** + * Allows a given application to return a brief description of itself. + * + * @return An ApplicationDetails object describing the current application. Should not be null. + */ + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), + Collections.emptyList(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Collections.emptyList(); + } + + /** + * Will this application want to vary its argument list dynamically? + * If so, parse the command-line options and then prompt the subclass to return + * a list of argument providers. + * + * @return Whether the application should vary command-line arguments dynamically. 
+ */ + protected boolean canAddArgumentsDynamically() { return false; } + + /** + * Provide a list of object to inspect, looking for additional command-line arguments. + * + * @return A list of objects to inspect. + */ + protected Class[] getArgumentSources() { + return new Class[]{}; + } + + /** + * Name this argument source. Provides the (full) class name as a default. + * + * @param source The argument source. + * + * @return a name for the argument source. + */ + protected String getArgumentSourceName( Class source ) { return source.toString(); } + + /** + * Sets the command-line parsing engine. Necessary for unit testing purposes. + * @param parser the new command-line parsing engine + */ + public void setParser( ParsingEngine parser ) { + this.parser = parser; + } + + /** + * this is the function that the inheriting class can expect to have called + * when all the argument processing is done + * + * @return the return code to exit the program with + * @throws Exception when an exception occurs + */ + protected abstract int execute() throws Exception; + + public static int result = -1; + + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args) throws Exception { + start(clp, args, false); + } + + /** + * This function is called to start processing the command line, and kick + * off the execute message of the program. + * + * @param clp the command line program to execute + * @param args the command line arguments passed in + * @param dryRun dry run + * @throws Exception when an exception occurs + */ + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { + + try { + // setup our log layout + PatternLayout layout = new PatternLayout(); + + Logger logger = CommandLineUtils.getStingLogger(); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Initialize the logger using the defaults. 
+ clp.setupLoggerLevel(layout); + + // setup the parser + ParsingEngine parser = clp.parser = new ParsingEngine(clp); + parser.addArgumentSource(clp.getClass()); + + Map parsedArgs; + + // process the args + if (clp.canAddArgumentsDynamically()) { + // if the command-line program can toss in extra args, fetch them and reparse the arguments. + parser.parse(args); + + // Allow invalid and missing required arguments to pass this validation step. + // - InvalidArgument in case these arguments are specified by plugins. + // - MissingRequiredArgument in case the user requested help. Handle that later, once we've + // determined the full complement of arguments. + if ( ! dryRun ) + parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, + ParsingEngine.ValidationType.InvalidArgument)); + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + + Class[] argumentSources = clp.getArgumentSources(); + for (Class argumentSource : argumentSources) + parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); + parsedArgs = parser.parse(args); + + if (isVersionPresent(parser)) + printVersionAndExit(); + + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + if ( ! dryRun ) parser.validate(); + } else { + parsedArgs = parser.parse(args); + + if ( ! dryRun ) { + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + parser.validate(); + } + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + } + + if ( ! 
dryRun ) { + // if they specify a log location, output our data there + if (clp.toFile != null) { + FileAppender appender; + try { + appender = new FileAppender(layout, clp.toFile, false); + logger.addAppender(appender); + } catch (IOException e) { + throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + } + } + + // regardless of what happens next, generate the header information + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); + + // call the execute + CommandLineProgram.result = clp.execute(); + } + } + catch (ArgumentException e) { + //clp.parser.printHelp(clp.getApplicationDetails()); + // Rethrow the exception to exit with an error. + throw e; + } + } + + /** + * Find fields in the object obj that look like command-line arguments, and put command-line + * arguments into them. + * + * @param obj Object to inspect for command line arguments. + */ + public void loadArgumentsIntoObject(Object obj) { + parser.loadArgumentsIntoObject(obj); + } + + /** + * this function checks the logger level passed in on the command line, taking the lowest + * level that was provided. + * @param layout Pattern layout to format based on the logger level. 
+ */ + private void setupLoggerLevel(PatternLayout layout) { + layout.setConversionPattern(patternString); + + // set the default logger level + Level par; + if (logging_level.toUpperCase().equals("DEBUG")) { + par = Level.DEBUG; + } else if (logging_level.toUpperCase().equals("ERROR")) { + par = Level.ERROR; + } else if (logging_level.toUpperCase().equals("FATAL")) { + par = Level.FATAL; + } else if (logging_level.toUpperCase().equals("INFO")) { + par = Level.INFO; + } else if (logging_level.toUpperCase().equals("WARN")) { + par = Level.WARN; + } else if (logging_level.toUpperCase().equals("OFF")) { + par = Level.OFF; + } else { + // we don't understand the logging level, let's get out of here + throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (INFO, DEBUG, ERROR, FATAL, OFF)"); + } + + Logger.getRootLogger().setLevel(par); + } + + /** + * a function used to indicate an error occurred in the command line tool + */ + private static void printDocumentationReference() { + errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); + errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); + } + + + /** + * Do a cursory search for the given argument. + * + * @param parser Parser + * + * @return True if help is present; false otherwise. + */ + private static boolean isHelpPresent(ParsingEngine parser) { + return parser.isArgumentPresent("help"); + } + + /** + * Print help and exit. + * + * @param clp Instance of the command-line program. + * @param parser True if help is present; false otherwise. + */ + private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { + parser.printHelp(clp.getApplicationDetails()); + System.exit(0); + } + + /** + * Do a cursory search for the argument "version". + * + * @param parser Parser + * + * @return True if version is present; false otherwise. 
+ */ + private static boolean isVersionPresent(ParsingEngine parser) { + return parser.isArgumentPresent("version"); + } + + /** + * Print help and exit. + */ + private static void printVersionAndExit() { + System.out.println(CommandLineGATK.getVersionNumber().toString()); + System.exit(0); + } + + + private static void errorPrintf(String format, Object... s) { + String formatted = String.format(format, s); + + if ( formatted.trim().equals("") ) + System.err.println("##### ERROR"); + else { + for ( String part : formatted.split("\n") ) { + System.err.println("##### ERROR " + part); + } + } + } + + + /** + * used to indicate an error occured + * + * @param msg the message + * @param t the error + */ + public static void exitSystemWithError(String msg, final Throwable t) { + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("stack trace %n"); + t.printStackTrace(); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); + errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); + printDocumentationReference(); + if ( msg == null ) // some exceptions don't have detailed messages + msg = "Code exception (see stack trace for error itself)"; + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", msg.trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithUserError(final Exception e) { + if ( e.getMessage() == null ) + throw new ReviewedStingException("UserException found with no message!", e); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); + errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithSamError(final Throwable t) { + if ( t.getMessage() == null ) + throw new ReviewedStingException("SamException found with no message!", t); + + 
errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); + errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); + errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); + errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + + /** + * used to indicate an error occured + * + * @param t the exception that occurred + */ + public static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); + } + + /** + * A hack to ensure that numbers are always formatted in the US style. 
+ */ + protected static void forceJVMLocaleToUSEnglish() { + Locale.setDefault(Locale.US); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/CommandLineUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/EnumerationArgumentDefault.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gather.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gather.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Gather.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gather.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gatherer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gatherer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Gatherer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Gatherer.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Hidden.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Hidden.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Hidden.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Hidden.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Input.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Input.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Input.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Input.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..d2a1735fb --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,85 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Use this option to perform the analysis over only part of the genome. This argument can be specified multiple times. + * You can use samtools-style intervals either explicitly on the command line (e.g. -L chr1 or -L chr1:100-200) or + * by loading in a file containing a list of intervals (e.g. -L myFile.intervals). + * + * Additionally, you can also specify a ROD file (such as a VCF file) in order to perform the analysis at specific + * positions based on the records present in the file (e.g. -L file.vcf). + * + * Finally, you can also use this to perform the analysis on the reads that are completely unmapped in the BAM file + * (i.e. those without a reference contig) by specifying -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate", required = false) + public List> intervals = null; + + /** + * Use this option to exclude certain parts of the genome from the analysis (like -L, but the opposite). + * This argument can be specified multiple times. You can use samtools-style intervals either explicitly on the + * command line (e.g. -XL chr1 or -XL chr1:100-200) or by loading in a file containing a list of intervals + * (e.g. -XL myFile.intervals). + * + * Additionally, you can also specify a ROD file (such as a VCF file) in order to exclude specific + * positions from the analysis based on the records present in the file (e.g. -L file.vcf). 
+ * */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing", required = false) + public List> excludeIntervals = null; + + /** + * By default, the program will take the UNION of all intervals specified using -L and/or -XL. However, you can + * change this setting, for example if you want to take the INTERSECTION of the sets instead. E.g. to perform the + * analysis on positions for which there is a record in a VCF, but restrict this to just those on chromosome 20, + * you would do -L chr20 -L file.vcf -isr INTERSECTION. + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Set merging approach to use for combining interval inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * By default, the program merges abutting intervals (i.e. intervals that are directly side-by-side but do not + * actually overlap) into a single continuous interval. However you can change this behavior if you want them to be + * treated as separate intervals instead. + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Interval merging rule for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * Use this to add padding to the intervals specified using -L and/or -XL. For example, '-L chr1:100' with a + * padding value of 20 would turn into '-L chr1:80-120'. This is typically used to add padding around exons when + * analyzing exomes. The general Broad exome calling pipeline uses 100 bp padding by default. 
+ */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Amount of padding (in bp) to add to each interval", required = false, minValue = 0) + public int intervalPadding = 0; +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java new file mode 100644 index 000000000..de57de871 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Requires; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.FeatureReader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.util.*; + +/** + * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. + * + * The IntervalBinding is a formal GATK argument that bridges between a walker and + * the engine to construct intervals for traversal at runtime. The IntervalBinding can + * either be a RodBinding, a string of one interval, or a file with interval strings. + * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. + * + * Note that this class is immutable. + */ +public final class IntervalBinding { + + private RodBinding featureIntervals; + private String stringIntervals; + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + featureIntervals = new RodBinding<>(type, rawName, source, tribbleType, tags); + } + + @Requires({"intervalArgument != null"}) + public IntervalBinding(String intervalArgument) { + stringIntervals = intervalArgument; + } + + public String getSource() { + return ( featureIntervals != null ? 
featureIntervals.getSource() : stringIntervals ); + } + + public List getIntervals(final GenomeAnalysisEngine toolkit) { + return getIntervals(toolkit.getGenomeLocParser()); + } + + public List getIntervals(final GenomeLocParser genomeLocParser) { + List intervals; + + if ( featureIntervals != null ) { + intervals = new ArrayList<>(); + + // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files + + final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + if ( codec instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); + try { + FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); + for ( Feature feature : reader.iterator() ) + intervals.add(genomeLocParser.createGenomeLoc(feature)); + } catch (Exception e) { + throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); + } + + } else { + intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); + } + + Collections.sort(intervals); + return intervals; + } + + public String toString() { + return getSource(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/MissingArgumentValueException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/MissingArgumentValueException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/MissingArgumentValueException.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Output.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Output.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Output.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedArgs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedArgs.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedListArgs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsedListArgs.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java new file mode 100644 index 000000000..ad64aaa1d --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -0,0 +1,829 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Requires; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.HelpFormatter; + +import java.io.File; +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.util.*; + +/** + * A parser for Sting command-line arguments. + */ +public class ParsingEngine { + + /** + * The loaded argument sources along with their back definitions. + */ + private Map argumentSourcesByDefinition = new HashMap(); + + /** + * A list of defined arguments against which command lines are matched. + * Package protected for testing access. + */ + public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); + + /** + * A list of matches from defined arguments to command-line text. + * Indicates as best as possible where command-line text remains unmatched + * to existing arguments. 
+ */ + private ArgumentMatches argumentMatches = null; + + /** + * Techniques for parsing and for argument lookup. + */ + private List parsingMethods = new ArrayList(); + + /** + * All of the RodBinding objects we've seen while parsing + */ + private List rodBindings = new ArrayList(); + + /** + * Class reference to the different types of descriptors that the create method can create. + * The type of set used must be ordered (but not necessarily sorted). + */ + private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), + new IntervalBindingArgumentTypeDescriptor(), + new RodBindingArgumentTypeDescriptor(), + new RodBindingCollectionArgumentTypeDescriptor(), + new CompoundArgumentTypeDescriptor(), + new MultiplexArgumentTypeDescriptor()) ); + + private Set argumentTypeDescriptors = new LinkedHashSet(); + + /** + * List of tags associated with the given instantiation of the command-line argument. + */ + private final Map tags = new IdentityHashMap(); + + private PluginManager argumentProviderPluginManager = + new PluginManager(ParsingEngineArgumentProvider.class); + + /** + * our log, which we want to capture anything from org.broadinstitute.sting + */ + protected static Logger logger = Logger.getLogger(ParsingEngine.class); + + public ParsingEngine( CommandLineProgram clp ) { + RodBinding.resetNameCounter(); + parsingMethods.add( ParsingMethod.FullNameParsingMethod ); + parsingMethods.add( ParsingMethod.ShortNameParsingMethod ); + + // Order matters here! Make sure the clp's new type descriptors go in before the original type descriptors. + if(clp != null) + argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); + argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); + + List> providers = argumentProviderPluginManager.getPlugins(); + for (Class provider: providers) { + addArgumentSource(provider); + } + } + + /** + * Add a main argument source. 
Argument sources are expected to have + * any number of fields with an @Argument annotation attached. + * @param source An argument source from which to extract command-line arguments. + */ + public void addArgumentSource( Class source ) { + addArgumentSource(null, source); + } + + public ArgumentMatches getArgumentMatches() { + return argumentMatches; + } + + /** + * Add an argument source. Argument sources are expected to have + * any number of fields with an @Argument annotation attached. + * @param sourceName name for this argument source. 'Null' indicates that this source should be treated + * as the main module. + * @param sourceClass A class containing argument sources from which to extract command-line arguments. + */ + public void addArgumentSource( String sourceName, Class sourceClass ) { + List argumentsFromSource = new ArrayList(); + for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { + List argumentDefinitions = argumentSource.createArgumentDefinitions(); + for(ArgumentDefinition argumentDefinition: argumentDefinitions) { + argumentSourcesByDefinition.put(argumentDefinition,argumentSource); + argumentsFromSource.add( argumentDefinition ); + } + } + argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); + } + + /** + * Do a cursory search to see if an argument with the given name is present. + * @param argumentFullName full name of the argument. + * @return True if the argument is present. False otherwise. + */ + public boolean isArgumentPresent( String argumentFullName ) { + ArgumentDefinition definition = + argumentDefinitions.findArgumentDefinition(argumentFullName,ArgumentDefinitions.FullNameDefinitionMatcher); + return argumentMatches.hasMatch(definition); + + } + + /** + * Parse the given set of command-line arguments, returning + * an ArgumentMatches object describing the best fit of these + * command-line arguments to the arguments that are actually + * required. 
+ * @param tokens Tokens passed on the command line. + * @return The parsed arguments by file. + */ + public SortedMap parse( String[] tokens ) { + argumentMatches = new ArgumentMatches(); + SortedMap parsedArgs = new TreeMap(); + + List cmdLineTokens = Arrays.asList(tokens); + parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); + + List providers = argumentProviderPluginManager.createAllTypes(); + + for (ParsingEngineArgumentProvider provider: providers) { + // Load the arguments ONLY into the provider. + // Validation may optionally run on the rest of the arguments. + loadArgumentsIntoObject(provider); + } + + for (ParsingEngineArgumentProvider provider: providers) { + provider.parse(this, parsedArgs); + } + + return parsedArgs; + } + + public void parse(ArgumentMatchSource matchSource, List tokens, + ArgumentMatches argumentMatches, SortedMap parsedArgs) { + ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); + + int i = 0; + for (String token: tokens) { + // If the token is of argument form, parse it into its own argument match. + // Otherwise, pair it with the most recently used argument discovered. 
+ ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); + if( isArgumentForm(token) ) { + ArgumentMatch argumentMatch = parseArgument( token, site ); + if( argumentMatch != null ) { + argumentMatches.mergeInto( argumentMatch ); + lastArgumentMatchSite = site; + } + } + else { + if( argumentMatches.hasMatch(lastArgumentMatchSite) && + !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) + argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) ); + else + argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) ); + + } + i++; + } + + parsedArgs.put(matchSource, new ParsedListArgs(tokens)); + } + + public void parsePairs(ArgumentMatchSource matchSource, List> tokens, + ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs, + SortedMap parsedArgs) { + int i = 0; + for (Pair pair: tokens) { + + ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); + List matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher); + ArgumentDefinition definition = null; + for (DefinitionMatcher matcher: matchers) { + definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher ); + if (definition != null) + break; + } + if (definition == null) + continue; + ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags()); + argumentMatches.mergeInto(argumentMatch); + argumentMatch.addValue(site, pair.getSecond()); + i++; + } + + parsedArgs.put(matchSource, matchSourceArgs); + } + + protected List getArguments(File file) { + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getListArguments(file); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + throw new UserException.CouldNotReadInputFile(file, "file extension is not .list"); + } + + private List getListArguments(File 
file) throws IOException { + ArrayList argsList = new ArrayList(); + for (String line: FileUtils.readLines(file)) + argsList.addAll(Arrays.asList(Utils.escapeExpressions(line))); + return argsList; + } + + public enum ValidationType { MissingRequiredArgument, + InvalidArgument, + InvalidArgumentValue, + ValueMissingArgument, + TooManyValuesForArgument, + MutuallyExclusive } + + /** + * Validates the list of command-line argument matches. + */ + public void validate() { + validate( EnumSet.noneOf(ValidationType.class) ); + } + + /** + * Validates the list of command-line argument matches. On failure throws an exception with detailed info about the + * particular failures. Takes an EnumSet indicating which validation checks to skip. + * @param skipValidationOf List of validation checks to skip. + */ + public void validate( EnumSet skipValidationOf ) { + // Find missing required arguments. + if( !skipValidationOf.contains(ValidationType.MissingRequiredArgument) ) { + Collection requiredArguments = + argumentDefinitions.findArgumentDefinitions( true, ArgumentDefinitions.RequiredDefinitionMatcher ); + Collection missingArguments = new ArrayList(); + for( ArgumentDefinition requiredArgument: requiredArguments ) { + if( !argumentMatches.hasMatch(requiredArgument) ) + missingArguments.add( requiredArgument ); + } + + if( missingArguments.size() > 0 ) + throw new MissingArgumentException( missingArguments ); + } + + // Find invalid arguments. Invalid arguments will have a null argument definition. + if( !skipValidationOf.contains(ValidationType.InvalidArgument) ) { + ArgumentMatches invalidArguments = argumentMatches.findUnmatched(); + if( invalidArguments.size() > 0 ) + throw new InvalidArgumentException( invalidArguments ); + } + + // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. 
+ if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { + Collection verifiableArguments = + argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); + Collection> invalidValues = new ArrayList>(); + for( ArgumentDefinition verifiableArgument: verifiableArguments ) { + ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); + // Check to see whether an argument value was specified. Argument values must be provided + // when the argument name is specified and the argument is not a flag type. + for(ArgumentMatch verifiableMatch: verifiableMatches) { + ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); + if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) + invalidValues.add(new Pair(verifiableArgument,null)); + } + + // Ensure that the field contents meet the validation criteria specified by the regular expression. + for( ArgumentMatch verifiableMatch: verifiableMatches ) { + for( ArgumentMatchValue value: verifiableMatch.values() ) { + if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) ) + invalidValues.add( new Pair(verifiableArgument, value.asString()) ); + } + } + } + + if( invalidValues.size() > 0 ) + throw new InvalidArgumentValueException( invalidValues ); + } + + // Find values without an associated mate. + if( !skipValidationOf.contains(ValidationType.ValueMissingArgument) ) { + if( argumentMatches.MissingArgument.values().size() > 0 ) + throw new UnmatchedArgumentException( argumentMatches.MissingArgument ); + } + + // Find arguments with too many values. 
+ if( !skipValidationOf.contains(ValidationType.TooManyValuesForArgument)) { + Collection overvaluedArguments = new ArrayList(); + for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { + // Warning: assumes that definition is not null (asserted by checks above). + if( !argumentMatch.definition.isMultiValued && argumentMatch.values().size() > 1 ) + overvaluedArguments.add(argumentMatch); + } + + if( !overvaluedArguments.isEmpty() ) + throw new TooManyValuesForArgumentException(overvaluedArguments); + } + + // Find sets of options that are supposed to be mutually exclusive. + if( !skipValidationOf.contains(ValidationType.MutuallyExclusive)) { + Collection> invalidPairs = new ArrayList>(); + for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { + if( argumentMatch.definition.exclusiveOf != null ) { + for( ArgumentMatch conflictingMatch: argumentMatches.findSuccessfulMatches() ) { + // Skip over the current element. + if( argumentMatch == conflictingMatch ) + continue; + if( argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.fullName) || + argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.shortName)) + invalidPairs.add( new Pair(argumentMatch, conflictingMatch) ); + } + } + } + + if( !invalidPairs.isEmpty() ) + throw new ArgumentsAreMutuallyExclusiveException( invalidPairs ); + } + } + + /** + * Loads a set of matched command-line arguments into the given object. + * @param object Object into which to add arguments. + */ + public void loadArgumentsIntoObject( Object object ) { + loadArgumentsIntoObject(object, true); + } + + /** + * Loads a set of matched command-line arguments into the given object. + * @param object Object into which to add arguments. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. 
This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. + */ + public void loadArgumentsIntoObject( Object object, boolean enforceArgumentRanges ) { + List argumentSources = extractArgumentSources(object.getClass()); + + List dependentArguments = new ArrayList(); + + for( ArgumentSource argumentSource: argumentSources ) { + if(argumentSource.isDeprecated() && argumentMatches.findMatches(this,argumentSource).size() > 0) + notifyDeprecatedCommandLineArgument(argumentSource); + + // If this argument source depends on other command-line arguments, skip it and make a note to process it later. + if(argumentSource.isDependent()) { + dependentArguments.add(argumentSource); + continue; + } + loadValueIntoObject(argumentSource, object, argumentMatches.findMatches(this,argumentSource), enforceArgumentRanges); + } + + for(ArgumentSource dependentArgument: dependentArguments) { + MultiplexArgumentTypeDescriptor dependentDescriptor = dependentArgument.createDependentTypeDescriptor(this,object); + ArgumentSource dependentSource = dependentArgument.copyWithCustomTypeDescriptor(dependentDescriptor); + loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource), enforceArgumentRanges); + } + } + + /** + * Notify the user that tags have been created. + * @param key The key created. + * @param tags List of tags, or empty list if no tags are present. + */ + public void addTags(Object key, final Tags tags) { + this.tags.put(key,tags); + } + + /** + * Gets the tags associated with a given object. + * @param key Key for which to find a tag. + * @return List of tags associated with this key. + */ + public Tags getTags(Object key) { + if(!tags.containsKey(key)) + return new Tags(); + return tags.get(key); + } + + /** + * Add a RodBinding type argument to this parser. 
Called during parsing to allow + * us to track all of the RodBindings discovered in the command line. + * @param rodBinding the rodbinding to add. Must not be added twice + */ + @Requires("rodBinding != null") + public void addRodBinding(final RodBinding rodBinding) { + rodBindings.add(rodBinding); + } + + /** + * Notify the user that a deprecated command-line argument has been used. + * @param argumentSource Deprecated argument source specified by user. + */ + private void notifyDeprecatedCommandLineArgument(ArgumentSource argumentSource) { + // Grab the first argument definition and report that one as the failure. Theoretically, we should notify of all failures. + List definitions = argumentSource.createArgumentDefinitions(); + if(definitions.size() < 1) + throw new ReviewedStingException("Internal error. Argument source creates no definitions."); + ArgumentDefinition definition = definitions.get(0); + throw new UserException.DeprecatedArgument(definition.fullName,definition.doc); + } + + /** + * Loads a single argument into the object and that objects children. + * @param argumentMatches Argument matches to load into the object. + * @param source Argument source to load into the object. + * @param instance Object into which to inject the value. The target might be in a container within the instance. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. + */ + private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches, boolean enforceArgumentRanges ) { + // Nothing to load + if( argumentMatches.size() == 0 && ! 
source.createsTypeDefault() ) + return; + + // Target instance into which to inject the value. + Collection targets = findTargets( source, instance ); + + // Abort if no home is found for the object. + if( targets.size() == 0 ) + throw new ReviewedStingException("Internal command-line parser error: unable to find a home for argument matches " + argumentMatches); + + for( Object target: targets ) { + Object value; + boolean usedTypeDefault = false; + if ( argumentMatches.size() != 0 ) { + value = source.parse(this,argumentMatches); + } + else { + value = source.createTypeDefault(this); + usedTypeDefault = true; + } + + // Only check argument ranges if a check was requested AND we used a value from the command line rather + // than the type default + if ( enforceArgumentRanges && ! usedTypeDefault ) { + checkArgumentRange(source, value); + } + + JVMUtils.setFieldValue(source.field,target,value); + } + } + + /** + * Check the provided value against any range constraints specified in the Argument annotation + * for the corresponding field. Throw an exception if hard limits are violated, or emit a warning + * if soft limits are violated. + * + * Only checks numeric types (int, double, etc.) + * Only checks fields with an actual @Argument annotation + * Only checks manually-specified constraints (there are no default constraints). + * + * @param argumentSource The source field for the command-line argument + * @param argumentValue The value we're considering putting in that source field + */ + private void checkArgumentRange( final ArgumentSource argumentSource, final Object argumentValue ) { + // Only validate numeric types + if ( ! 
(argumentValue instanceof Number) ) { + return; + } + final double argumentDoubleValue = ((Number)argumentValue).doubleValue(); + + // Only validate fields with an @Argument annotation + final Annotation argumentAnnotation = argumentSource.field.getAnnotation(Argument.class); + if ( argumentAnnotation == null ) { + return; + } + + final double minValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minValue"); + final double maxValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxValue"); + final double minRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minRecommendedValue"); + final double maxRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxRecommendedValue"); + final String argumentName = (String)CommandLineUtils.getValue(argumentAnnotation, "fullName"); + + // Check hard limits first, if specified + if ( minValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, minValue, "minimum"); + } + + if ( maxValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, maxValue, "maximum"); + } + + // Then check soft limits, if specified + if ( minRecommendedValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but minimum recommended value is %.2f", + argumentName, argumentDoubleValue, minRecommendedValue)); + } + + if ( maxRecommendedValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but maximum recommended value is %.2f", + argumentName, argumentDoubleValue, maxRecommendedValue)); + } + } + + public Collection getRodBindings() { + return Collections.unmodifiableCollection(rodBindings); + } + + /** + * Gets a 
collection of the container instances of the given type stored within the given target. + * @param source Argument source. + * @param instance Container. + * @return A collection of containers matching the given argument source. + */ + private Collection findTargets(ArgumentSource source, Object instance) { + LinkedHashSet targets = new LinkedHashSet(); + for( Class clazz = instance.getClass(); clazz != null; clazz = clazz.getSuperclass() ) { + for( Field field: clazz.getDeclaredFields() ) { + if( field.equals(source.field) ) { + targets.add(instance); + } else if( field.isAnnotationPresent(ArgumentCollection.class) ) { + targets.addAll(findTargets(source, JVMUtils.getFieldValue(field, instance))); + } + } + } + return targets; + } + + /** + * Prints out the help associated with these command-line argument definitions. + * @param applicationDetails Details about the specific GATK-based application being run. + */ + public void printHelp( ApplicationDetails applicationDetails ) { + new HelpFormatter().printHelp(applicationDetails,argumentDefinitions); + } + + /** + * Extract all the argument sources from a given object. + * @param sourceClass class to act as sources for other arguments. + * @return A list of sources associated with this object and its aggregated objects. + */ + public List extractArgumentSources(Class sourceClass) { + return extractArgumentSources(sourceClass, new Field[0]); + } + + /** + * Fetch the best command-line argument descriptor for the given class. + * @param type Class for which to specify a descriptor. + * @return descriptor for the given type. 
+ */ + public ArgumentTypeDescriptor selectBestTypeDescriptor(Class type) { + return ArgumentTypeDescriptor.selectBest(argumentTypeDescriptors,type); + } + + private List extractArgumentSources(Class sourceClass, Field[] parentFields) { + // now simply call into the truly general routine extract argument bindings but with a null + // object so bindings aren't computed + Map bindings = extractArgumentBindings(null, sourceClass, parentFields); + return new ArrayList(bindings.keySet()); + } + + public Map extractArgumentBindings(Object obj) { + if ( obj == null ) throw new IllegalArgumentException("Incoming object cannot be null"); + return extractArgumentBindings(obj, obj.getClass(), new Field[0]); + } + + /** + * Extract all the argument sources from a given object, along with their bindings if obj != null . + * @param obj the object corresponding to the sourceClass + * @param sourceClass class to act as sources for other arguments. + * @param parentFields Parent Fields + * @return A map of sources associated with this object and its aggregated objects and bindings to their bindings values + */ + private Map extractArgumentBindings(Object obj, Class sourceClass, Field[] parentFields) { + Map bindings = new LinkedHashMap(); + + while( sourceClass != null ) { + Field[] fields = sourceClass.getDeclaredFields(); + for( Field field: fields ) { + if( ArgumentTypeDescriptor.isArgumentAnnotationPresent(field) ) { + Object val = obj != null ? JVMUtils.getFieldValue(field, obj) : null; + bindings.put( new ArgumentSource(parentFields, field, selectBestTypeDescriptor(field.getType())), val ); + } + if( field.isAnnotationPresent(ArgumentCollection.class) ) { + Object val = obj != null ? 
JVMUtils.getFieldValue(field, obj) : null; + Field[] newParentFields = Arrays.copyOf(parentFields, parentFields.length + 1); + newParentFields[parentFields.length] = field; + bindings.putAll( extractArgumentBindings(val, field.getType(), newParentFields) ); + } + } + + sourceClass = sourceClass.getSuperclass(); + } + + return bindings; + } + + /** + * Determines whether a token looks like the name of an argument. + * @param token Token to inspect. Can be surrounded by whitespace. + * @return True if token is of short name form. + */ + private boolean isArgumentForm( String token ) { + for( ParsingMethod parsingMethod: parsingMethods ) { + if( parsingMethod.matches(token) ) + return true; + } + + return false; + } + + /** + * Parse a short name into an ArgumentMatch. + * @param token The token to parse. The token should pass the isLongArgumentForm test. + * @param position The position of the token in question. + * @return ArgumentMatch associated with this token, or null if no match exists. + */ + private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) { + if( !isArgumentForm(token) ) + throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token ); + + for( ParsingMethod parsingMethod: parsingMethods ) { + if( parsingMethod.matches( token ) ) + return parsingMethod.match( argumentDefinitions, token, position ); + } + + // No parse results found. + return null; + } +} + +/** + * An exception indicating that some required arguments are missing. 
+ */ +class MissingArgumentException extends ArgumentException { + public MissingArgumentException( Collection missingArguments ) { + super( formatArguments(missingArguments) ); + } + + private static String formatArguments( Collection missingArguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentDefinition missingArgument: missingArguments ) { + if( missingArgument.shortName != null ) + sb.append( String.format("%nArgument with name '--%s' (-%s) is missing.", missingArgument.fullName, missingArgument.shortName) ); + else + sb.append( String.format("%nArgument with name '--%s' is missing.", missingArgument.fullName) ); + } + return sb.toString(); + } +} + +/** + * An exception for undefined arguments. + */ +class InvalidArgumentException extends ArgumentException { + public InvalidArgumentException( ArgumentMatches invalidArguments ) { + super( formatArguments(invalidArguments) ); + } + + private static String formatArguments( ArgumentMatches invalidArguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatch invalidArgument: invalidArguments ) + sb.append( String.format("%nArgument with name '%s' isn't defined.", invalidArgument.label) ); + return sb.toString(); + } +} + +/** + * An exception for values whose format is invalid. 
+ */ +class InvalidArgumentValueException extends ArgumentException { + public InvalidArgumentValueException( Collection> invalidArgumentValues ) { + super( formatArguments(invalidArgumentValues) ); + } + + private static String formatArguments( Collection> invalidArgumentValues ) { + StringBuilder sb = new StringBuilder(); + for( Pair invalidValue: invalidArgumentValues ) { + if(invalidValue.getSecond() == null) + sb.append( String.format("%nArgument '--%s' requires a value but none was provided", + invalidValue.first.fullName) ); + else + sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", + invalidValue.first.fullName, + invalidValue.second, + invalidValue.first.validation) ); + } + return sb.toString(); + } +} + +class ArgumentValueOutOfRangeException extends ArgumentException { + public ArgumentValueOutOfRangeException( final String argumentName, final double argumentActualValue, + final double argumentBoundaryValue, final String argumentBoundaryType ) { + super(String.format("Argument --%s has value %.2f, but %s allowed value is %.2f", + argumentName, argumentActualValue, argumentBoundaryType, argumentBoundaryValue)); + } +} + +/** + * An exception for values that can't be mated with any argument. 
+ */ +class UnmatchedArgumentException extends ArgumentException { + public UnmatchedArgumentException( ArgumentMatch invalidValues ) { + super( formatArguments(invalidValues) ); + } + + private static String formatArguments( ArgumentMatch invalidValues ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatchSite site: invalidValues.sites.keySet() ) + for( ArgumentMatchValue value: invalidValues.sites.get(site) ) { + switch (site.getSource().getType()) { + case CommandLine: + sb.append( String.format("%nInvalid argument value '%s' at position %d.", + value.asString(), site.getIndex()) ); + break; + case Provider: + sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.", + value.asString(), site.getSource().getDescription(), site.getIndex()) ); + break; + default: + throw new RuntimeException( String.format("Unexpected argument match source type: %s", + site.getSource().getType())); + } + if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString())) + sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace."); + } + return sb.toString(); + } +} + +/** + * An exception indicating that too many values have been provided for the given argument. + */ +class TooManyValuesForArgumentException extends ArgumentException { + public TooManyValuesForArgumentException( Collection arguments ) { + super( formatArguments(arguments) ); + } + + private static String formatArguments( Collection arguments ) { + StringBuilder sb = new StringBuilder(); + for( ArgumentMatch argument: arguments ) + sb.append( String.format("%nArgument '%s' has too many values: %s.", argument.label, Arrays.deepToString(argument.values().toArray())) ); + return sb.toString(); + } +} + +/** + * An exception indicating that mutually exclusive options have been passed in the same command line. 
+ */ +class ArgumentsAreMutuallyExclusiveException extends ArgumentException { + public ArgumentsAreMutuallyExclusiveException( Collection> arguments ) { + super( formatArguments(arguments) ); + } + + private static String formatArguments( Collection> arguments ) { + StringBuilder sb = new StringBuilder(); + for( Pair argument: arguments ) + sb.append( String.format("%nArguments '%s' and '%s' are mutually exclusive.", argument.first.definition.fullName, argument.second.definition.fullName ) ); + return sb.toString(); + } + +} + + +/** + * An exception for when an argument doesn't match an of the enumerated options for that var type + */ +class UnknownEnumeratedValueException extends ArgumentException { + public UnknownEnumeratedValueException(ArgumentDefinition definition, String argumentPassed) { + super( formatArguments(definition,argumentPassed) ); + } + + private static String formatArguments(ArgumentDefinition definition, String argumentPassed) { + return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingMethod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ParsingMethod.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java new file mode 100644 index 000000000..87fa85858 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBinding.java @@ -0,0 +1,197 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broad.tribble.Feature; + +import java.util.*; + +/** + * A RodBinding represents a walker argument that gets bound to a ROD track. + * + * The RodBinding is a formal GATK argument that bridges between a walker and + * the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding + * is explicitly typed with type of the Tribble.Feature expected to be produced by this + * argument. The GATK Engine takes care of initializing the binding and connecting it + * to the RMD system. + * + * It is recommended that optional RodBindings be initialized to the value returned + * by the static method makeUnbound(). + * + * Note that this class is immutable. + */ +public final class RodBinding { + protected final static String UNBOUND_VARIABLE_NAME = ""; + protected final static String UNBOUND_SOURCE = "UNBOUND"; + protected final static String UNBOUND_TRIBBLE_TYPE = ""; + + /** + * Create an unbound Rodbinding of type. This is the correct programming + * style for an optional RodBinding + * + * At Input() + * RodBinding x = RodBinding.makeUnbound(T.class) + * + * The unbound binding is guaranteed to never match any binding. It uniquely + * returns false to isBound(). + * + * @param type the Class type produced by this unbound object + * @param any class extending Tribble Feature + * @return the UNBOUND RodBinding producing objects of type T + */ + @Requires("type != null") + protected final static RodBinding makeUnbound(Class type) { + return new RodBinding(type); + } + + /** The name of this binding. Often the name of the field itself, but can be overridden on cmdline */ + final private String name; + /** where the data for this ROD is coming from. A file or special value if coming from stdin */ + final private String source; + /** the string name of the tribble type, such as vcf, bed, etc. 
*/ + final private String tribbleType; + /** The command line tags associated with this RodBinding */ + final private Tags tags; + /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */ + final private Class type; + /** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */ + final private boolean bound; + + /** + * The name counter. This is how we create unique names for collections of RodBindings + * on the command line. If you have provide the GATK with -X file1 and -X file2 to a + * RodBinding argument as List> then each binding will receive automatically + * the name of X and X2. + */ + final private static Map nameCounter = new HashMap(); + + /** for UnitTests */ + final public static void resetNameCounter() { + nameCounter.clear(); + } + + @Requires("rawName != null") + @Ensures("result != null") + final private static synchronized String countedVariableName(final String rawName) { + Integer count = nameCounter.get(rawName); + if ( count == null ) { + nameCounter.put(rawName, 1); + return rawName; + } else { + nameCounter.put(rawName, count + 1); + return rawName + (count + 1); + } + } + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public RodBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + this.type = type; + this.name = countedVariableName(rawName); + this.source = source; + this.tribbleType = tribbleType; + this.tags = tags; + this.bound = true; + } + + /** + * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName + * @param type + * @param rawName + */ + public RodBinding(Class type, final String rawName) { + this(type, rawName, "missing", type.getSimpleName(), new Tags()); + } + + /** + * Make an unbound RodBinding. 
Only available for creating the globally unique UNBOUND object + * @param type class this unbound RodBinding creates + */ + @Requires({"type != null"}) + private RodBinding(Class type) { + this.type = type; + this.name = UNBOUND_VARIABLE_NAME; // special value can never be found in RefMetaDataTracker + this.source = UNBOUND_SOURCE; + this.tribbleType = UNBOUND_TRIBBLE_TYPE; + this.tags = new Tags(); + this.bound = false; + } + + + /** + * @return True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments + */ + final public boolean isBound() { + return bound; + } + + /** + * @return The name of this binding. Often the name of the field itself, but can be overridden on cmdline + */ + @Ensures({"result != null"}) + final public String getName() { + return name; + } + + /** + * @return the string name of the tribble type, such as vcf, bed, etc. + */ + @Ensures({"result != null"}) + final public Class getType() { + return type; + } + + /** + * @return where the data for this ROD is coming from. A file or special value if coming from stdin + */ + @Ensures({"result != null"}) + final public String getSource() { + return source; + } + + /** + * @return The command line tags associated with this RodBinding. Will include the tags used to + * determine the name and type of this RodBinding + */ + @Ensures({"result != null"}) + final public Tags getTags() { + return tags; + } + + /** + * @return The Java class expected for this RodBinding. 
Must correspond to the type emited by Tribble + */ + @Ensures({"result != null"}) + final public String getTribbleType() { + return tribbleType; + } + + @Override + public String toString() { + return String.format("(RodBinding name=%s source=%s)", getName(), getSource()); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java new file mode 100644 index 000000000..d8306ea5a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/RodBindingCollection.java @@ -0,0 +1,89 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Ensures; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.util.*; + +/** + * A RodBindingCollection represents a collection of RodBindings. + * + * The RodBindingCollection is a formal GATK argument that is used to specify a file of RodBindings. + * + */ +public final class RodBindingCollection { + + /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */ + final private Class type; + + private Collection> rodBindings; + + public RodBindingCollection(final Class type, final Collection> rodBindings) { + this.type = type; + this.rodBindings = Collections.unmodifiableCollection(rodBindings); + } + + /** + * @return the collection of RodBindings + */ + final public Collection> getRodBindings() { + return rodBindings; + } + + /** + * @return the string name of the tribble type, such as vcf, bed, etc. 
+ */ + @Ensures({"result != null"}) + final public Class getType() { + return type; + } + + @Override + public String toString() { + return String.format("(RodBindingCollection %s)", getRodBindings()); + } + + /** + * Utility method to help construct a RodBindingCollection of the given Feature type + * + * @param type the Feature type + * @param rodBindings the rod bindings to put into the collection + * @return a new RodBindingCollection object + */ + public static Object createRodBindingCollectionOfType(final Class type, final Collection rodBindings) { + try { + final Constructor ctor = RodBindingCollection.class.getConstructor(Class.class, Collection.class); + return ctor.newInstance(type, rodBindings); + } catch (final Exception e) { + throw new IllegalStateException("Failed to create a RodBindingCollection for type " + type); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/Tags.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Tags.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/Tags.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/Tags.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/commandline/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/package-info.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java new file mode 100644 index 000000000..86ecaffe0 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The 
Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.crypt.GATKKey; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.ListFileUtils; + +import java.security.PublicKey; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +/** + * @author aaron + */ +public abstract class CommandLineExecutable extends CommandLineProgram { + /** + * The actual engine which performs the analysis. + */ + protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // get the analysis name + public abstract String getAnalysisName(); + + /** + * Gets the GATK argument bundle. + * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. + */ + protected abstract GATKArgumentCollection getArgumentCollection(); + + /** + * A list of all the arguments initially used as sources. 
+ */ + private final Collection argumentSources = new ArrayList(); + + protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + + /** + * this is the function that the inheriting class can expect to have called + * when the command line system has initialized. + * + * @return the return code to exit the program with + */ + protected int execute() throws Exception { + engine.setParser(parser); + argumentSources.add(this); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + + try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + + engine.setArguments(getArgumentCollection()); + + // File lists can require a bit of additional expansion. Set these explicitly by the engine. + final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); + engine.setSAMFileIDs(bamFileList); + if(getArgumentCollection().showFullBamList){ + logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); + } + + engine.setWalker(walker); + walker.setToolkit(engine); + + Collection filters = engine.createFilters(); + engine.setFilters(filters); + + // load the arguments into the walker / filters. + // TODO: The fact that this extra load call exists here when all the parsing happens at the engine + // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive + // TODO: argument processing. 
+ loadArgumentsIntoObject(walker); + argumentSources.add(walker); + + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + engine.setReferenceMetaDataFiles(rodBindings); + + for (ReadFilter filter: filters) { + loadArgumentsIntoObject(filter); + argumentSources.add(filter); + } + + engine.execute(); + generateGATKRunReport(walker); + } catch ( Exception e ) { + generateGATKRunReport(walker, e); + throw e; + } + + // always return 0 + return 0; + } + + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see " + UserException.PHONE_HOME_DOCS_URL + + " for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } + + /** + * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. + * This report will be written to either STDOUT or to the run repository, depending on the options + * for -et. 
+ * + * @param e the exception, can be null if no exception occurred + */ + private void generateGATKRunReport(Walker walker, Exception e) { + if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { + GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); + report.postReport(getArgumentCollection().phoneHomeType); + } + } + + /** + * Convenience method for fully parameterized generateGATKRunReport when an exception has + * not occurred + * + * @param walker + */ + private void generateGATKRunReport(Walker walker) { + generateGATKRunReport(walker, null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), + new SAMFileWriterArgumentTypeDescriptor(engine,System.out), + new OutputStreamArgumentTypeDescriptor(engine,System.out) ); + } + + /** + * GATK can add arguments dynamically based on analysis type. + * + * @return true + */ + @Override + protected boolean canAddArgumentsDynamically() { + return true; + } + + /** + * GATK provides the walker as an argument source. + * @return List of walkers to load dynamically. + */ + @Override + protected Class[] getArgumentSources() { + // No walker info? No plugins. 
+ if (getAnalysisName() == null) return new Class[] {}; + + Collection argumentSources = new ArrayList(); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); + walker.setToolkit(engine); + argumentSources.add(walker.getClass()); + + Collection filters = engine.createFilters(); + for(ReadFilter filter: filters) + argumentSources.add(filter.getClass()); + + Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; + return argumentSources.toArray(argumentSourcesAsArray); + } + + @Override + protected String getArgumentSourceName( Class argumentSource ) { + return engine.getWalkerName((Class)argumentSource); + } + +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java new file mode 100644 index 000000000..728fee5c8 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -0,0 +1,385 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk; + +import net.sf.picard.PicardException; +import net.sf.samtools.SAMException; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.Attribution; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.*; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; + +import java.util.*; + +/** + * All command line parameters accepted by all tools in the GATK. + * + *

Info for general users

+ * + *

This is a list of options and parameters that are generally available to all tools in the GATK.

+ * + *

There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR + * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used + * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This + * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just + * skimming the one-line summary in the table.

+ * + *

Info for developers

+ * + *

This class is the GATK engine itself, which manages map/reduce data access and runs walkers.

+ * + *

We run command line GATK programs using this class. It gets the command line args, parses them, and hands the + * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here; + * the GATK engine should deal with any data related information.

+ */ +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) +public class CommandLineGATK extends CommandLineExecutable { + /** + * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) + * is available in the online documentation. + */ + @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") + private String analysisName = null; + + // our argument collection, the collection of command line args we accept + @ArgumentCollection + private GATKArgumentCollection argCollection = new GATKArgumentCollection(); + + /** + * Get pleasing info about the GATK. + * + * @return A list of Strings that contain pleasant info about the GATK. + */ + @Override + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(createApplicationHeader(), + getAttribution(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + getAdditionalHelp()); + } + + @Override + public String getAnalysisName() { + return analysisName; + } + + @Override + protected GATKArgumentCollection getArgumentCollection() { + return argCollection; + } + + /** + * Required main method implementation. + */ + public static void main(String[] argv) { + try { + CommandLineGATK instance = new CommandLineGATK(); + start(instance, argv); + System.exit(CommandLineProgram.result); // todo -- this is a painful hack + } catch (UserException e) { + exitSystemWithUserError(e); + } catch (TribbleException e) { + // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are + // lazy loaded, so they aren't caught elsewhere and made into User Exceptions + exitSystemWithUserError(e); + } catch(PicardException e) { + // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
+ exitSystemWithError(e); + } catch (SAMException e) { + checkForMaskedUserErrors(e); + exitSystemWithSamError(e); + } catch (OutOfMemoryError e) { + exitSystemWithUserError(new UserException.NotEnoughMemory()); + } catch (Throwable t) { + checkForMaskedUserErrors(t); + exitSystemWithError(t); + } + } + + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; + public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; + + private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error + final String message = t.getMessage(); + if ( message == null ) + return; + + // too many open files error + if ( message.contains("Too many open files") ) + exitSystemWithUserError(new UserException.TooManyOpenFiles()); + + // malformed BAM looks like a SAM file + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + exitSystemWithSamError(t); + + // can't close tribble index when writing + if ( message.contains("Unable to close index for") ) + exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); + + // disk is full + if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); + } + + /** + * Creates a short blurb about the GATK, copyright info, and where to get documentation. + * + * @return The application header. + */ + public static List createApplicationHeader() { + List header = new ArrayList(); + header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); + header.add("Copyright (c) 2010 The Broad Institute"); + header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); + return header; + } + + public static String getVersionNumber() { + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + return headerInfo.containsKey("org.broadinstitute.sting.gatk.version") ? headerInfo.getString("org.broadinstitute.sting.gatk.version") : ""; + } + + public static String getBuildTime() { + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; + } + + /** + * If the user supplied any additional attribution, return it here. + * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. + */ + private List getAttribution() { + List attributionLines = new ArrayList(); + + // If no analysis name is present, fill in extra help on the walkers. 
+ WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(analysisName)) { + Class walkerType = walkerManager.getWalkerClassByName(analysisName); + if(walkerType.isAnnotationPresent(Attribution.class)) + attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); + } + return attributionLines; + } + + /** + * Retrieves additional information about GATK walkers. + * the code in HelpFormatter and supply it as a helper to this method. + * + * @return A string summarizing the walkers available in this distribution. + */ + private String getAdditionalHelp() { + String additionalHelp; + + // If no analysis name is present, fill in extra help on the walkers. + WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(getAnalysisName())) + additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); + else + additionalHelp = getAllWalkerHelp(); + + return additionalHelp; + } + + private static final int PACKAGE_INDENT = 1; + private static final int WALKER_INDENT = 3; + private static final String FIELD_SEPARATOR = " "; + + private String getWalkerHelp(Class walkerType) { + // Construct a help string to output details on this walker. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); + + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); + formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); + + return additionalHelp.toString(); + } + + /** + * Load in additional help information about all available walkers. 
+ * @return A string representation of the additional help. + */ + private String getAllWalkerHelp() { + // Construct a help string to output available walkers. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + // Get the list of walker names from the walker manager. + WalkerManager walkerManager = engine.getWalkerManager(); + + // Build a list sorted by walker display name. As this information is collected, keep track of the longest + // package / walker name for later formatting. + SortedSet helpText = new TreeSet(new HelpEntryComparator()); + + int longestPackageName = 0; + int longestWalkerName = 0; + for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { + // Get the display name. + String packageName = walkersByPackage.getKey(); + String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); + String packageHelpText = walkerManager.getPackageSummaryText(packageName); + + // Compute statistics about which names is longest. + longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); + + SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); + for(Class walkerType: walkersByPackage.getValue()) { + String walkerName = walkerType.getName(); + String walkerDisplayName = walkerManager.getName(walkerType); + String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); + + longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); + + walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); + } + + // Dump the walkers into the sorted set. 
+ helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); + } + + final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); + + + for(HelpEntry packageHelp: helpText) { + printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + for(HelpEntry walkerHelp: packageHelp.children) + printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + // Print a blank line between sets of walkers. + printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); + } + + return additionalHelp.toString(); + } + + private void printDescriptorLine(Formatter formatter, + int headerIndentWidth, + String header, + int headerWidth, + String fieldSeparator, + String description, + int lineWidth) { + final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; + final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; + List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); + + String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; + String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; + String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; + + // Output description line. 
+ formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", + "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); + for(int i = 1; i < wordWrappedText.size(); i++) + formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); + } + +} + +/** + * Represents a given help entry; contains a display name, a summary and optionally some children. + */ +class HelpEntry { + public final String uid; + public final String displayName; + public final String summary; + public final SortedSet children; + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + * @param children children for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary, SortedSet children) { + this.uid = uid; + this.displayName = displayName; + this.summary = summary; + this.children = children; + } + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary) { + this(uid,displayName,summary,null); + } + +} + +/** + * Compare two help entries by display name. + */ +class HelpEntryComparator implements Comparator { + private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); + + /** + * Compares the order of lhs to rhs, not taking case into account. + * @param lhs First object to compare. + * @param rhs Second object to compare. + * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. 
Nulls are treated as after everything else. + */ + public int compare(HelpEntry lhs, HelpEntry rhs) { + if(lhs == null && rhs == null) return 0; + if(lhs == null || lhs.displayName.equals("")) return 1; + if(rhs == null || rhs.displayName.equals("")) return -1; + return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); + } + + +} \ No newline at end of file diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java new file mode 100644 index 000000000..8df294b21 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -0,0 +1,1240 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import com.google.java.contract.Ensures; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.datasources.reads.*; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.executive.MicroScheduler; +import org.broadinstitute.sting.gatk.filters.FilterManager; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; +import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.samples.SampleDB; +import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; +import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; + +/** + * A GenomeAnalysisEngine that runs a specified walker. + */ +public class GenomeAnalysisEngine { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); + public static final long NO_RUNTIME_LIMIT = -1; + + /** + * The GATK command-line argument parsing code. + */ + private ParsingEngine parsingEngine; + + /** + * The genomeLocParser can create and parse GenomeLocs. + */ + private GenomeLocParser genomeLocParser; + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDB sampleDB = null; + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of intervals used by the engine. 
+ */ + private GenomeLocSortedSet intervals = null; + + /** + * Explicitly assign the interval set to use for this traversal (for unit testing purposes) + * @param intervals set of intervals to use for this traversal + */ + public void setIntervals( GenomeLocSortedSet intervals ) { + this.intervals = intervals; + } + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + + private ReadMetrics cumulativeMetrics = null; + + /** + * A currently hacky unique name for this GATK instance + */ + private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); + + /** + * our walker manager + */ + private final WalkerManager walkerManager = new WalkerManager(); + + private Walker walker; + + public void setWalker(Walker walker) { + this.walker = walker; + } + + /** + * The short name of the current GATK walker as a string + * @return a non-null String + */ + public String getWalkerName() { + return getWalkerName(walker.getClass()); + } + + /** + * A processed collection of SAM reader identifiers. + */ + private Collection samReaderIDs = Collections.emptyList(); + + /** + * Set the SAM/BAM files over which to traverse. + * @param samReaderIDs Collection of ids to use during this traversal. + */ + public void setSAMFileIDs(Collection samReaderIDs) { + this.samReaderIDs = samReaderIDs; + } + + /** + * Collection of reference metadata files over which to traverse. + */ + private Collection referenceMetaDataFiles; + + /** + * The threading efficiency monitor we use in the GATK to monitor our efficiency. 
+ * + * May be null if one isn't active, or hasn't be initialized yet + */ + private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * The global progress meter we are using to track our progress through the genome + */ + private ProgressMeter progressMeter = null; + + /** + * Set the reference metadata files to use for this traversal. + * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. + */ + public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { + this.referenceMetaDataFiles = referenceMetaDataFiles; + } + + /** + * The maximum runtime of this engine, in nanoseconds, set during engine initialization + * from the GATKArgumentCollection command line value + */ + private long runtimeLimitInNanoseconds = -1; + + /** + * Static random number generator and seed. + */ + private static final long GATK_RANDOM_SEED = 47382911L; + private static Random randomGenerator = new Random(GATK_RANDOM_SEED); + public static Random getRandomGenerator() { return randomGenerator; } + public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } + public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } + + /** + * Base Quality Score Recalibration helper object + */ + private BQSRArgumentSet bqsrArgumentSet = null; + public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } + public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } + public void setBaseRecalibration(final GATKArgumentCollection args) { + bqsrArgumentSet = new BQSRArgumentSet(args); + } + + /** + * Actually run the GATK with the specified walker. + * + * @return the value of this traversal. 
     */
    public Object execute() {
        // first thing is to make sure the AWS keys can be decrypted
        GATKRunReport.checkAWSAreValid();

        //HeapSizeMonitor monitor = new HeapSizeMonitor();
        //monitor.start();
        setStartTime(new java.util.Date());

        final GATKArgumentCollection args = this.getArguments();

        // validate our parameters
        if (args == null) {
            throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
        }

        // validate our parameters
        if (this.walker == null)
            throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null.");

        // honor a request for a non-reproducible run by reseeding the shared RNG from the clock
        if (args.nonDeterministicRandomSeed)
            resetRandomGenerator(System.currentTimeMillis());

        // if the user specified an input BQSR recalibration table then enable on the fly recalibration
        if (args.BQSR_RECAL_FILE != null)
            setBaseRecalibration(args);

        // setup the runtime limits
        setupRuntimeLimits(args);

        // Determine how the threads should be divided between CPU vs. IO.
        determineThreadAllocation();

        // Prepare the data for traversal.
        initializeDataSources();

        // initialize and validate the interval list
        initializeIntervals();
        validateSuppliedIntervals();

        // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary
        validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources);

        // initialize sampleDB
        initializeSampleDB();

        // our microscheduler, which is in charge of running everything
        MicroScheduler microScheduler = createMicroscheduler();
        threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();

        // create temp directories as necessary
        initializeTempDirectory();

        // create the output streams
        initializeOutputStreams(microScheduler.getOutputTracker());

        // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on
        logger.info("Preparing for traversal" +
                (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : ""));
        Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
        logger.info("Done preparing for traversal");

        // execute the microscheduler, storing the results
        return microScheduler.execute(this.walker, shardStrategy);

        // dead code below the return, kept from the original heap-monitoring experiment
        //monitor.stop();
        //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));

        //return result;
    }

    /**
     * Retrieves an instance of the walker based on the walker name.
     *
     * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown.
     * @return An instance of the walker.
     */
    public Walker getWalkerByName(String walkerName) {
        try {
            return walkerManager.createByName(walkerName);
        } catch ( UserException e ) {
            // upgrade the generic failure to a deprecation-specific message when the name matches a retired walker
            if ( isDeprecatedWalker(walkerName) ) {
                e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName));
            }
            throw e;
        }
    }

    /**
     * Gets the name of a given walker type.
     * @param walkerType Type of walker.
     * @return Name of the walker.
     */
    public String getWalkerName(Class walkerType) {
        return walkerManager.getName(walkerType);
    }

    /**
     * @return the unique (per-process) name of this GATK instance.
     */
    public String getName() {
        return myName;
    }

    /**
     * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters;
     * the caller must handle that directly.
     * @return A collection of available filters.
     */
    public Collection createFilters() {
        final List filters = new LinkedList<>();

        // First add the user requested filters
        if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0)
            filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList));
        for(final String filterName: this.getArguments().readFilters)
            filters.add(this.getFilterManager().createByName(filterName));

        // now add the walker default filters. This ordering is critically important if
        // users need to apply filters that fix up reads that would be removed by default walker filters
        filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager()));

        return Collections.unmodifiableList(filters);
    }

    /**
     * Discovers, initializes, and stores (via setReadTransformers) the active read transformers
     * for this run, sorted by priority ordering.
     *
     * NOTE(review): the original javadoc said "Returns a list" but the method is void; the result
     * is stored on the engine instead.
     *
     * @param walker the walker we need to apply read transformers to
     */
    public void initializeReadTransformers(final Walker walker) {
        // keep a list of the active read transformers sorted based on priority ordering
        List activeTransformers = new ArrayList();

        // a walker-level @ReadTransformersMode annotation may override when transformers are applied
        final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
        final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;

        final PluginManager pluginManager = new PluginManager(ReadTransformer.class);

        for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
            transformer.initialize(overrideTime, this, walker);
            if ( transformer.enabled() )
                activeTransformers.add(transformer);
        }

        setReadTransformers(activeTransformers);
    }

    /**
     * @return the active read transformers, as set by setReadTransformers.
     */
    public List getReadTransformers() {
        return readTransformers;
    }

    /*
     * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are).
     * At most one MUST_BE_FIRST and at most one MUST_BE_LAST transformer may be present.
     *
     * @param readTransformers the active read transformers
     */
    protected void checkActiveReadTransformers(final List readTransformers) {
        if ( readTransformers == null )
            throw new IllegalArgumentException("read transformers cannot be null");

        ReadTransformer sawMustBeFirst = null;
        ReadTransformer sawMustBeLast = null;

        for ( final ReadTransformer r : readTransformers ) {
            if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) {
                if ( sawMustBeFirst != null )
                    throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString());
                sawMustBeFirst = r;
            } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) {
                if ( sawMustBeLast != null )
                    throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString());
                sawMustBeLast = r;
            }
        }
    }

    /**
     * Sorts the given transformers into priority order, validates them, and installs them on the engine.
     * @param readTransformers non-null list of transformers to install
     */
    protected void setReadTransformers(final List readTransformers) {
        if ( readTransformers == null )
            throw new ReviewedStingException("read transformers cannot be null");

        // sort them in priority order
        Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator());

        // make sure we don't have an invalid set of active read transformers
        checkActiveReadTransformers(readTransformers);

        this.readTransformers = readTransformers;
    }

    /**
     * Parse out the thread allocation from the given command-line argument.
     */
    private void determineThreadAllocation() {
        // reject impossible thread counts up front, with argument-specific error messages
        if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
        if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
        if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);

        this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
                argCollection.numberOfCPUThreadsPerDataThread,
                argCollection.numberOfIOThreads,
                argCollection.monitorThreadEfficiency);
    }

    /**
     * @return the total thread count, or 1 if determineThreadAllocation() has not yet run.
     */
    public int getTotalNumberOfThreads() {
        return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
    }



    /**
     * Allow subclasses and others within this package direct access to the walker manager.
     * @return The walker manager used by this package.
     */
    protected WalkerManager getWalkerManager() {
        return walkerManager;
    }

    /**
     * setup a microscheduler
     *
     * @return a new microscheduler
     */
    private MicroScheduler createMicroscheduler() {
        // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary.
        if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) &&
                this.getArguments().referenceFile == null) {
            throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
        }

        return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
    }

    /**
     * Resolves the downsampling method: the command-line setting wins over the walker's default,
     * and the result is validated against the walker before being returned.
     */
    protected DownsamplingMethod getDownsamplingMethod() {
        GATKArgumentCollection argCollection = this.getArguments();

        DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
        DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker);

        DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod;
        method.checkCompatibilityWithWalker(walker);
        return method;
    }

    /**
     * Stores the resolved downsampling method back onto the argument collection.
     */
    protected void setDownsamplingMethod(DownsamplingMethod method) {
        argCollection.setDownsamplingMethod(method);
    }

    /**
     * Delegates to the walker's own policy for reads with deletions at a locus.
     */
    protected boolean includeReadsWithDeletionAtLoci() {
        return walker.includeReadsWithDeletionAtLoci();
    }

    /**
     * Verifies that the supplied set of reads files mesh with what the walker says it requires;
     * also makes sure that list of SAM files specified on the command line is not empty and contains
     * no duplicates.
     */
    protected void validateSuppliedReads() {
        GATKArgumentCollection arguments = this.getArguments();
        final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty());

        // Check what the walker says is required against what was provided on the command line.
        if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent)
            throw new ArgumentException("Walker requires reads but none were provided.");

        // Check what the walker says is allowed against what was provided on the command line.
        if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS))
            throw new ArgumentException("Walker does not allow reads but reads were provided.");

        //Make sure SAM list specified by the user (if necessary) is not empty
        if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) {
            throw new UserException("The list of input files does not contain any BAM files.");
        }

        // Make sure no SAM files were specified multiple times by the user.
        checkForDuplicateSamFiles();
    }

    /**
     * Checks whether there are SAM files that appear multiple times in the fully unpacked list of
     * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question.
     */
    protected void checkForDuplicateSamFiles() {
        Set encounteredSamFiles = new HashSet();
        // LinkedHashSet preserves first-seen order for the error message
        Set duplicateSamFiles = new LinkedHashSet();

        for ( SAMReaderID samFile : samReaderIDs ) {
            if ( encounteredSamFiles.contains(samFile) ) {
                duplicateSamFiles.add(samFile.getSamFilePath());
            }
            else {
                encounteredSamFiles.add(samFile);
            }
        }

        if ( duplicateSamFiles.size() > 0 ) {
            throw new UserException("The following BAM files appear multiple times in the list of input files: " +
                    duplicateSamFiles + " BAM files may be specified at most once.");
        }

    }

    /**
     * Verifies that the supplied reference file meshes with what the walker says it requires.
     */
    protected void validateSuppliedReference() {
        GATKArgumentCollection arguments = this.getArguments();
        // Check what the walker says is required against what was provided on the command line.
        // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required.
        if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null)
            throw new ArgumentException("Walker requires a reference but none was provided.");

        // Check what the walker says is allowed against what was provided on the command line.
        if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE))
            throw new ArgumentException("Walker does not allow a reference but one was provided.");
    }

    protected void validateSuppliedIntervals() {
        // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped.
        if(!(walker instanceof ReadWalker)) {
            GenomeLocSortedSet intervals = getIntervals();
            if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED))
                throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region.");
        }

        // If intervals is non-null and empty at this point, it means that the list of intervals to process
        // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
        // this was very likely unintentional, the user should be informed of this. Note that this is different
        // from the case where intervals == null, which indicates that there were no interval arguments.
        if ( intervals != null && intervals.isEmpty() ) {
            logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process.");
        }

        // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome
    }

    /**
     * Get the sharding strategy given a driving data source.
     *
     * @param readsDataSource readsDataSource
     * @param drivingDataSource Data on which to shard.
     * @param intervals intervals
     * @return the sharding strategy
     */
    protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
        ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
        DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
        ReferenceDataSource referenceDataSource = this.getReferenceDataSource();

        // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
        if(!readsDataSource.isEmpty()) {
            // unindexed BAMs are only tolerated when the user explicitly opted in, and never with intervals
            if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
                throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
            if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
                throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

            // dispatch on walker type: each traversal kind needs its own shard balancer and sort order
            if(walker instanceof LocusWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
            }
            else if(walker instanceof ActiveRegionWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer());
                else
                    // active region walkers may extend the requested intervals before sharding
                    return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer());
            }
            else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
                // Apply special validation to read pair walkers.
                if(walker instanceof ReadPairWalker) {
                    if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
                        throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
                    if(intervals != null && !intervals.isEmpty())
                        throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
                }

                if(intervals == null)
                    return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer());
            }
            else
                throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
        }
        else {
            // No reads: shard over the reference instead.
            // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well
            // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
            // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB]
            final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
            if(intervals == null)
                return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
            else
                return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
        }
    }

    /**
     * Whether the ROD system should use "flashback" data for this run.
     * NOTE(review): the semantics of "flashback" are not visible in this chunk; all we can see is
     * that it is enabled exactly for ReadWalkers.
     */
    protected boolean flashbackData() {
        return walker instanceof ReadWalker;
    }

    /**
     * Create the temp directory if it doesn't exist.
     */
    private void initializeTempDirectory() {
        File tempDir = new File(System.getProperty("java.io.tmpdir"));
        if (!tempDir.exists() && !tempDir.mkdirs())
            throw new UserException.BadTmpDir("Unable to create directory");
    }

    /**
     * Initialize the output streams as specified by the user.
     *
     * @param outputTracker the tracker supplying the initialization data.
     */
    private void initializeOutputStreams(OutputTracker outputTracker) {
        // register the externally-managed inputs and outputs with the tracker, then bind the walker
        for (Map.Entry input : getInputs().entrySet())
            outputTracker.addInput(input.getKey(), input.getValue());
        for (Stub stub : getOutputs())
            outputTracker.addOutput(stub);

        outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
    }

    /**
     * @return the reference data source in use by this engine.
     */
    public ReferenceDataSource getReferenceDataSource() {
        return referenceDataSource;
    }

    /**
     * @return the GenomeLocParser built from the reference's sequence dictionary.
     */
    public GenomeLocParser getGenomeLocParser() {
        return genomeLocParser;
    }

    /**
     * Manage lists of filters.
     */
    private final FilterManager filterManager = new FilterManager();

    private Date startTime = null; // the start time for execution

    public void setParser(ParsingEngine parsingEngine) {
        this.parsingEngine = parsingEngine;
    }

    /**
     * Explicitly set the GenomeLocParser, for unit testing.
     * @param genomeLocParser GenomeLocParser to use.
     */
    public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
        this.genomeLocParser = genomeLocParser;
    }

    /**
     * Sets the start time when the execute() function was last called
     * @param startTime the start time when the execute() function was last called
     */
    protected void setStartTime(Date startTime) {
        this.startTime = startTime;
    }

    /**
     * @return the start time when the execute() function was last called
     */
    public Date getStartTime() {
        return startTime;
    }

    /**
     * Setup the intervals to be processed
     */
    protected void initializeIntervals() {
        intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments);
    }

    /**
     * Add additional, externally managed IO streams for inputs.
     *
     * @param argumentSource Field into which to inject the value.
     * @param value Instance to inject.
     */
    public void addInput(ArgumentSource argumentSource, Object value) {
        inputs.put(argumentSource, value);
    }

    /**
     * Add additional, externally managed IO streams for output.
     *
     * @param stub Instance to inject.
     */
    public void addOutput(Stub stub) {
        outputs.add(stub);
    }

    /**
     * Returns the tag associated with a given command-line argument.
     * @param key Object for which to inspect the tag.
     * @return Tags object associated with the given key, or an empty Tag structure if none are present.
     */
    public Tags getTags(Object key) {
        return parsingEngine.getTags(key);
    }

    /**
     * Opens and validates the reference, reads, and ROD data sources.
     * Ordering matters: the reference must be opened first because the genome loc parser
     * and the reads/ROD sources are built from its sequence dictionary.
     */
    protected void initializeDataSources() {
        logger.info("Strictness is " + argCollection.strictnessLevel);

        validateSuppliedReference();
        setReferenceDataSource(argCollection.referenceFile);

        validateSuppliedReads();
        initializeReadTransformers(walker);

        readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());

        for (ReadFilter filter : filters)
            filter.initialize(this);

        // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
        rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe);
    }

    /**
     * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or
     * need to absolutely positively kill everyone in the room)
     * @param dataSource replacement reads data source
     */
    public void setReadsDataSource(final SAMDataSource dataSource) {
        this.readsDataSource = dataSource;
    }

    /**
     * Entry-point function to initialize the samples database from input data and pedigree arguments
     */
    private void initializeSampleDB() {
        SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
        sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
        sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
        sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
        sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
        sampleDB = sampleDBBuilder.getFinalSampleDB();
    }

    /**
     * Gets a unique identifier for the reader sourcing this read.
     * @param read Read to examine.
     * @return A unique identifier for the source file of this read. Exception if not found.
     */
    public SAMReaderID getReaderIDForRead(final SAMRecord read) {
        return getReadsDataSource().getReaderID(read);
    }

    /**
     * Gets the source file for this read.
     * @param id Unique identifier determining which input file to use.
     * @return The source filename for this read.
     */
    public File getSourceFileForReaderID(final SAMReaderID id) {
        return getReadsDataSource().getSAMFile(id);
    }

    /**
     * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vs. the reference ordered data (if available).
     *
     * @param reads Reads data source.
     * @param reference Reference data source.
     * @param rods a collection of the reference ordered data tracks
     */
    private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) {
        // nothing to validate without a reference, or without any reads/tracks
        if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
            return;

        // Compile a set of sequence names that exist in the reference file.
        SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary();

        if (!reads.isEmpty()) {
            // Compile a set of sequence names that exist in the BAM files.
            SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();

            if (readsDictionary.size() == 0) {
                logger.info("Reads file is unmapped. Skipping validation against reference.");
                return;
            }

            // compare the reads to the reference
            SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary,
                    "reference", referenceDictionary, true, intervals);
        }

        // every ROD track's dictionary must also be consistent with the reference
        for (ReferenceOrderedDataSource rod : rods)
            IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe);
    }

    /**
     * Gets a data source for the given set of reads.
     *
     * @param argCollection arguments
     * @param genomeLocParser parser
     * @param refReader reader
     * @return A data source for the given set of reads.
     */
    private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
        DownsamplingMethod downsamplingMethod = getDownsamplingMethod();

        // Synchronize the method back into the collection so that it shows up when
        // interrogating for the downsampling method during command line recreation.
        setDownsamplingMethod(downsamplingMethod);

        logger.info(downsamplingMethod);

        if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
            throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");

        boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class);

        // an explicit "keep" request wins over both the flag and the walker annotation
        if (argCollection.keepProgramRecords)
            removeProgramRecords = false;

        final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;

        // optional on-the-fly sample renaming, driven by a user-supplied mapping file
        final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ?
                loadSampleRenameMap(argCollection.sampleRenameMappingFile) :
                null;

        return new SAMDataSource(
                samReaderIDs,
                threadAllocation,
                argCollection.numberOfBAMFileHandles,
                genomeLocParser,
                argCollection.useOriginalBaseQualities,
                argCollection.strictnessLevel,
                argCollection.readBufferSize,
                downsamplingMethod,
                new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
                filters,
                readTransformers,
                includeReadsWithDeletionAtLoci(),
                argCollection.defaultBaseQualities,
                removeProgramRecords,
                keepReadsInLIBS,
                sampleRenameMap);
    }

    /**
     * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory
     * HashMap. This file must consist of lines with two whitespace-separated fields:
     *
     * absolute_path_to_bam_file new_sample_name
     *
     * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample
     * renaming feature is being used.
     *
     * @param sampleRenameMapFile sample rename map file from which to load data
     * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and
     *         the values being the new sample names.
     */
    protected Map loadSampleRenameMap( final File sampleRenameMapFile ) {
        logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath());

        // rough initial capacity: assume ~50 bytes per mapping line
        final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50);

        try {
            for ( final String line : new XReadLines(sampleRenameMapFile) ) {
                final String[] tokens = line.split("\\s+");

                if ( tokens.length != 2 ) {
                    throw new UserException.MalformedFile(sampleRenameMapFile,
                            String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s",
                                    tokens.length, line));
                }

                final File bamFile = new File(tokens[0]);
                final String newSampleName = tokens[1];

                // relative paths would make the map ambiguous across working directories
                if ( ! bamFile.isAbsolute() ) {
                    throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line);
                }

                final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags());

                if ( sampleRenameMap.containsKey(bamID) ) {
                    throw new UserException.MalformedFile(sampleRenameMapFile,
                            String.format("Bam file %s appears more than once", bamFile.getAbsolutePath()));
                }

                sampleRenameMap.put(bamID, newSampleName);
            }
        }
        catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e);
        }

        return sampleRenameMap;
    }


    /**
     * Opens a reference sequence file paired with an index. Only public for testing purposes
     *
     * @param refFile Handle to a reference sequence file. Non-null.
     */
    public void setReferenceDataSource(File refFile) {
        this.referenceDataSource = new ReferenceDataSource(refFile);
        // the loc parser is rebuilt from the new reference's dictionary
        genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
    }

    /**
     * Open the reference-ordered data sources.
     *
     * @param referenceMetaDataFiles collection of RMD descriptors to load and validate.
     * @param sequenceDictionary GATK-wide sequence dictionary to use for validation.
     * @param genomeLocParser to use when creating and validating GenomeLocs.
     * @param validationExclusionType potentially indicate which validations to include / exclude.
     *
     * @return A list of reference-ordered data sources.
     */
    private List getReferenceOrderedDataSources(Collection referenceMetaDataFiles,
                                                SAMSequenceDictionary sequenceDictionary,
                                                GenomeLocParser genomeLocParser,
                                                ValidationExclusion.TYPE validationExclusionType) {
        final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType,
                getArguments().disableAutoIndexCreationAndLockingWhenReadingRods);

        final List dataSources = new ArrayList();
        for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
            dataSources.add(new ReferenceOrderedDataSource(fileDescriptor,
                    builder,
                    sequenceDictionary,
                    genomeLocParser,
                    flashbackData()));

        return dataSources;
    }

    /**
     * Returns the SAM File Header from the input reads' data source file
     * @return the SAM File Header from the input reads' data source file
     */
    public SAMFileHeader getSAMFileHeader() {
        return readsDataSource.getHeader();
    }

    public boolean lenientVCFProcessing() {
        return lenientVCFProcessing(argCollection.unsafe);
    }

    /**
     * @return true if the given validation-exclusion setting requests lenient VCF processing.
     */
    public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) {
        return val == ValidationExclusion.TYPE.ALL
                || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING;
    }

    /**
     * Returns the unmerged SAM file header for an individual reader.
     * @param reader The reader.
     * @return Header for that reader or null if not available.
     */
    public SAMFileHeader getSAMFileHeader(SAMReaderID reader) {
        return readsDataSource == null ? null : readsDataSource.getHeader(reader);
    }

    /**
     * Returns an ordered list of the unmerged SAM file headers known to this engine.
+ * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getReadsDataSource() { + return this.readsDataSource; + } + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals, or null if no intervals are in use + */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Get the list of regions of the genome being processed. If the user + * requested specific intervals, return those, otherwise return regions + * corresponding to the entire genome. Never returns null. 
+ * + * @return a non-null set of intervals being processed + */ + @Ensures("result != null") + public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { + if ( getIntervals() == null ) + // if we don't have any intervals defined, create intervals from the reference itself + return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); + else + return getIntervals(); + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * Returns a clone of this snapshot in time. + * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is + * owned by the caller; the caller can do with the object what they wish. 
+ */ + public ReadMetrics getCumulativeMetrics() { + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; + } + + /** + * Return the global ThreadEfficiencyMonitor, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- + + public SampleDB getSampleDB() { + return this.sampleDB; + } + + public Map getApproximateCommandLineArguments(Object... argumentProviders) { + return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); + } + + public String createApproximateCommandLineArgumentString(Object... argumentProviders) { + return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); + } + + // ------------------------------------------------------------------------------------- + // + // code for working with progress meter + // + // ------------------------------------------------------------------------------------- + + /** + * Register the global progress meter with this engine + * + * Calling this function more than once will result in an IllegalStateException + * + * @param meter a non-null progress meter + */ + public void registerProgressMeter(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); + if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); + + progressMeter = meter; + } + + /** + * Get the progress meter being used by this engine. 
May be null if no meter has been registered yet + * @return a potentially null pointer to the progress meter + */ + public ProgressMeter getProgressMeter() { + return progressMeter; + } + + /** + * Does the current runtime in unit exceed the runtime limit, if one has been provided? + * + * @return false if not limit was requested or if runtime <= the limit, true otherwise + */ + public boolean exceedsRuntimeLimit() { + if ( progressMeter == null ) + // not yet initialized or not set because of testing + return false; + + if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) + return false; + else { + final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); + if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); + final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); + return runtime > maxRuntimeNano; + } + } + + /** + * @return the runtime limit in nanoseconds, or -1 if no limit was specified + */ + public long getRuntimeLimitInNanoseconds() { + return runtimeLimitInNanoseconds; + } + + /** + * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds + * as appropriate + * + * @param args the GATKArgumentCollection to retrieve our runtime limits from + */ + private void setupRuntimeLimits(final GATKArgumentCollection args) { + if ( args.maxRuntime == NO_RUNTIME_LIMIT ) + runtimeLimitInNanoseconds = -1; + else if (args.maxRuntime < 0 ) + throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); + else { + runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadMetrics.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadMetrics.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadProperties.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/ReadProperties.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/WalkerManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/WalkerManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java new file mode 100644 index 000000000..e86780eb4 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -0,0 +1,571 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.arguments; + +import net.sf.samtools.SAMFileReader; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * @author aaron + * @version 1.0 + */ +public class GATKArgumentCollection { + + /** the constructor */ + public GATKArgumentCollection() { + } + + // parameters and their defaults + /** + * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a + * list of input files (with extension 
.list). Note that the GATK requires an accompanying index for each SAM or + * BAM file. Please see our online documentation for more details on input formatting requirements. + */ + @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) + public List samFiles = new ArrayList(); + + @Hidden + @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") + public Boolean showFullBamList = false; + + @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) + public Integer readBufferSize = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic + * non-identifying statistics (which tool was used, whether the run was successful etc.) that help us for debugging + * and development. You can use this option to turn off reporting if your run environment is not connected to the + * internet or if your data is subject to stringent confidentiality clauses (e.g. clinical patient data). + * To do so you will need to request a key using the online request form on our website. + */ + @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) + public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; + /** + * Please see the online documentation FAQs for more details on the key system and how to request a key. 
+ */ + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) + public File gatkKeyFile = null; + + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis. One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // General features + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, + * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool + * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter + * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not + * recognized by the program). Note also that some read filters are applied by default for some analysis tools; this + * is specified in each tool's documentation. The default filters cannot be disabled. 
+ */ + @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) + public final List readFilters = new ArrayList(); + + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); + /** + * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary + * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although + * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. + * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, + * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in + * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few + * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. + */ + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + /** + * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. + */ + @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) + public boolean nonDeterministicRandomSeed = false; + /** + * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. + */ + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") + public boolean disableDithering = false; + /** + * This will truncate the run but without exiting with a failure. 
By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. + */ + @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) + public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; + + @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) + public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; + + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that Many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from command line using the + * downsampling arguments. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) + public DownsampleType downsamplingType = null; + /** + * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of + * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling + * is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of + * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target + * coverage you need to aim for in order to obtain enough coverage in all loci of interest. 
+ */ + @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) + public Double downsampleFraction = null; + + /** + * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to + * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes + * unreasonable computational costs. The downsampling process takes two different forms depending on the type of + * analysis it is used with. + * + * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), + * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals + * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start + * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers + * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available + * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation + * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of + * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be + * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than + * requested. + */ + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", + doc = "Target coverage threshold for downsampling to coverage", + required = false, minValue = 0) + public Integer downsampleCoverage = null; + + /** + * Gets the downsampling method explicitly specified by the user. 
If the user didn't specify + * a default downsampling mechanism, return the default. + * @return The explicitly specified downsampling mechanism, or the default if none exists. + */ + public DownsamplingMethod getDownsamplingMethod() { + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) + return null; + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); + } + + /** + * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. + * @param method The downsampling mechanism. + */ + public void setDownsamplingMethod(DownsamplingMethod method) { + if (method == null) + throw new IllegalArgumentException("method is null"); + + downsamplingType = method.type; + downsampleCoverage = method.toCoverage; + downsampleFraction = method.toFraction; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // BAQ arguments + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) + public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; + /** + * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. 
+ */ + @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) + public double BAQGOP = BAQ.DEFAULT_GOP; + + // -------------------------------------------------------------------------------------------------------------- + // + // quality encoding checking arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. + * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the + * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should + * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are + * not in the correct encoding. + */ + @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) + public boolean FIX_MISENCODED_QUALS = false; + /** + * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly + * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know + * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. + */ + @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) + public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; + /** + * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which + * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. 
If no OQ + * tag is present for a read, the standard qual score will be used. + */ + @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) + public Boolean useOriginalBaseQualities = false; + /** + * If reads are missing some or all base quality scores, this value will be used for all base quality scores. + * By default this is set to -1 to disable default base quality assignment. + */ + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) + public byte defaultBaseQualities = -1; + + // -------------------------------------------------------------------------------------------------------------- + // + // performance log arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. This table is suitable for importing into R or any + * other analysis software that can read tsv files. + */ + @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) + public File performanceLog = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // BQSR arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads + * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. + * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). 
+ */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") + public File BQSR_RECAL_FILE = null; + + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using that many levels. + * Negative values mean that we should quantize using the recalibration report's quantization level. + */ + @Hidden + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) + public int quantizationLevels = 0; + + /** + * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. + */ + @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) + public boolean disableIndelQuals = false; + + /** + * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. + * Note that this may results in significant file size increase. + */ + @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) + public boolean emitOriginalQuals = false; + + /** + * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. + * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. + * For example, Illumina writes Q2 bases when the machine has really gone wrong. 
This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. + */ + @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) + public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; + /** + * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. + */ + @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) + public double globalQScorePrior = -1.0; + + /** + * It is absolutely not recommended practice to run base quality score recalibration on BAM files that have been + * processed with ReduceReads. By default, the GATK will error out if it detects that you are trying to recalibrate + * a reduced BAM file. However, this flag allows you to disable the warning and proceed anyway. For the sake of your + * data, please only use this option if you really know what you are doing. 
+ */ + @Advanced + @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Ignore all warnings about how it's a really bad idea to run BQSR on a reduced BAM file (AT YOUR OWN RISK!)", required = false) + public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Other utility arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. + */ + @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) + public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; + /** + * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. + */ + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) + public boolean removeProgramRecords = false; + /** + * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. + */ + @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) + public boolean keepProgramRecords = false; + /** + * This option requires that each BAM file listed in the mapping file have only a single sample specified in its header + * (though there may be multiple read groups for that sample). 
Each line of the mapping file must contain the absolute + * path to a BAM file, followed by whitespace, followed by the new sample name for that BAM file. + */ + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) + public File sampleRenameMappingFile = null; + /** + * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. + */ + @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) + public ValidationExclusion.TYPE unsafe; + /** + * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking + * when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index + * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it + * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general + * because it allows reading from index files without first acquiring a lock. 
+ */ + @Hidden + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "Disable both auto-generation of index files and index file locking", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing + * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. + * See online documentation FAQs for more information. + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) + public Integer numberOfDataThreads = 1; + + /** + * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than + * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. See online documentation FAQs for more information. 
+ */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) + @Hidden + public int numberOfIOThreads = 0; + + /** + * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny + * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for + * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. + */ + @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) + public Boolean monitorThreadEfficiency = false; + + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) + public Integer numberOfBAMFileHandles = null; + /** + * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. + */ + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) + public List readGroupBlackList = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // PED (pedigree) support + // + // -------------------------------------------------------------------------------------------------------------- + + /** + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

+ * + * + * + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ * + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
+ * + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+ * + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

+ * + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

+ * + *

Affection status should be coded:

+ * + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
+ * + *

If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

+ * + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

+ * + *

For example, here are two individuals (one row = one person):

+ * + *
+     *   FAM001  1  0 0  1  2
+     *   FAM001  2  0 0  1  2
+     * 
+ * + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

+ * + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

+ */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); + + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * NO INTEGRATION TESTS are available. Use at your own risk. + */ + @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) + @Hidden + public boolean allowIntervalsWithUnindexedBAM = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing BCF2 + // + // -------------------------------------------------------------------------------------------------------------- + /** + * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. 
+ */ + @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) + @Hidden + public boolean generateShadowBCF = false; + // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // VCF/BCF index parameters + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Specify the Tribble indexing strategy to use for VCFs. + * + * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter + * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter + * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + */ + @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) + @Advanced + public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + /** + * This is either the bin width or the number of features per bin, depending on the indexing strategy + */ + @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) + @Advanced + public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/InvalidPositionException.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RODMetaDataContainer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReadView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/View.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/View.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/View.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/providers/package-info.java rename 
to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/providers/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java rename 
to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShard.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/Shard.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/Shard.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMFileStat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/BAMTagRenamer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBAMRegion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/utilities/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/package-info.java 
similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reference/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reference/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/DataStreamSegment.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/EntireStream.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/MappedStreamSegment.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/ResourcePool.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/rmd/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/Accumulator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/Accumulator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/Accumulator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java diff --git 
a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java new file mode 100644 index 000000000..405c07392 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -0,0 +1,463 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.executive; + +import com.google.java.contract.Ensures; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.ReadMetrics; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.traversals.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; + +import javax.management.JMException; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import java.io.File; +import java.lang.management.ManagementFactory; +import java.util.*; + + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 26, 2009 + * Time: 12:37:23 PM + * + * General base class for all scheduling algorithms + * Shards and schedules data in manageable chunks. + * + * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. 
For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler + * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler + * can properly shut them all down when the scheduling is done. + * + */ +public abstract class MicroScheduler implements MicroSchedulerMBean { + protected static final Logger logger = Logger.getLogger(MicroScheduler.class); + + /** + * The list of all Traversal engines we've created in this micro scheduler + */ + final List allCreatedTraversalEngines = new LinkedList(); + + /** + * All available engines. Engines are borrowed and returned when a subclass is actually + * going to execute the engine on some data. This allows us to have N copies for + * N data parallel executions, but without the dangerous code of having local + * ThreadLocal variables. + */ + final LinkedList availableTraversalEngines = new LinkedList(); + + /** + * Engines that have been allocated to a key already. + */ + final HashMap allocatedTraversalEngines = new HashMap(); + + /** + * Counts the number of instances of the class that are currently alive. + */ + private static int instanceNumber = 0; + + /** + * The engine invoking this scheduler. + */ + protected final GenomeAnalysisEngine engine; + + protected final IndexedFastaSequenceFile reference; + + private final SAMDataSource reads; + protected final Collection rods; + + private final MBeanServer mBeanServer; + private final ObjectName mBeanName; + + /** + * Threading efficiency monitor for tracking the resource utilization of the GATK + * + * may be null + */ + ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * MicroScheduler factory function. 
Create a microscheduler appropriate for reducing the + * selected walker. + * + * @param walker Which walker to use. + * @param reads the informations associated with the reads + * @param reference the reference file + * @param rods the rods to include in the traversal + * @param threadAllocation Number of threads to utilize. + * + * @return The best-fit microscheduler. + */ + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if ( threadAllocation.isRunningInParallelMode() ) { + logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", + threadAllocation.getTotalNumThreads(), + threadAllocation.getNumCPUThreadsPerDataThread(), + threadAllocation.getNumDataThreads(), + Runtime.getRuntime().availableProcessors())); + if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) + logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), + Runtime.getRuntime().availableProcessors())); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + if (walker.isReduceByInterval()) + throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); + + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); + } + } + + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) { + throw badNT("nct", engine, walker); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } else { + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } + } + + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue(parallelArg, + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); + } + + /** + * Create a microscheduler given the reads and reference. + * + * @param walker the walker to execute with + * @param reads The reads. + * @param reference The reference. + * @param rods the rods to include in the traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal + */ + protected MicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + this.engine = engine; + this.reads = reads; + this.reference = reference; + this.rods = rods; + + final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + + // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, + // and adds it to the list of created engines for later shutdown. 
+ for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { + final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); + allCreatedTraversalEngines.add(traversalEngine); + availableTraversalEngines.add(traversalEngine); + } + + // Create the progress meter, and register it with the analysis engine + engine.registerProgressMeter(new ProgressMeter(progressLogFile, + availableTraversalEngines.peek().getTraversalUnits(), + engine.getRegionsOfGenomeBeingProcessed())); + + // Now that we have a progress meter, go through and initialize the traversal engines + for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) + traversalEngine.initialize(engine, walker, engine.getProgressMeter()); + + // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. + // To get around this limitation and since we have no job identifier at this point, register a simple counter that + // will count the number of instances of this object that have been created in this JVM. 
+ int thisInstance = instanceNumber++; + mBeanServer = ManagementFactory.getPlatformMBeanServer(); + try { + mBeanName = new ObjectName("org.broadinstitute.sting.gatk.executive:type=MicroScheduler,instanceNumber="+thisInstance); + mBeanServer.registerMBean(this, mBeanName); + } + catch (JMException ex) { + throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); + } + } + + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { + if (walker instanceof ReadWalker) { + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof LocusWalker) { + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + + /** + * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + /** + * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses + * + * @param threadEfficiencyMonitor + */ + public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { + this.threadEfficiencyMonitor = threadEfficiencyMonitor; + } + + /** + * Should we 
stop all execution work and exit gracefully? + * + * Returns true in the case where some external signal or time limit has been received, indicating + * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown + * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler + * examine this value as often as reasonable and, if it returns true, stop what they are doing + * at the next available opportunity, shutdown their resources, call notify done, and return. + * + * @return true if we should abort execution, or false otherwise + */ + protected boolean abortExecution() { + final boolean abort = engine.exceedsRuntimeLimit(); + if ( abort ) { + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); + logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); + } + return abort; + } + + /** + * Walks a walker over the given list of intervals. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + * + * @return the return type of the walker + */ + public abstract Object execute(Walker walker, Iterable shardStrategy); + + /** + * Tells this MicroScheduler that the execution of one of the subclass of this object as started + * + * Must be called when the implementation of execute actually starts up + * + * Currently only starts the progress meter timer running, but other start up activities could be incorporated + */ + protected void startingExecution() { + engine.getProgressMeter().start(); + } + + /** + * Retrieves the object responsible for tracking and managing output. + * @return An output tracker, for loading data in and extracting results. Will not be null. + */ + public abstract OutputTracker getOutputTracker(); + + /** + * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
+ * @param shard the shard to use when querying reads. + * @return an iterator over the reads specified in the shard. + */ + protected StingSAMIterator getReadIterator(Shard shard) { + return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); + } + + /** + * Must be called by subclasses when execute is done + */ + protected void executionIsDone() { + engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); + printReadFilteringStats(); + shutdownTraversalEngines(); + + // Print out the threading efficiency of this HMS, if state monitoring is enabled + if ( threadEfficiencyMonitor != null ) { + // include the master thread information + threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); + threadEfficiencyMonitor.printUsageInformation(logger); + } + } + + /** + * Shutdown all of the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines + */ + public synchronized void shutdownTraversalEngines() { + for ( final TraversalEngine te : allCreatedTraversalEngines) + te.shutdown(); + + allCreatedTraversalEngines.clear(); + availableTraversalEngines.clear(); + } + + /** + * Prints out information about number of reads observed and filtering, if any reads were used in the traversal + * + * Looks like: + * + * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter + * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter + */ + private void printReadFilteringStats() { + final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); + + for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + } + } + + /** + * Gets the engine that created this microscheduler. + * @return The engine owning this microscheduler. + */ + public GenomeAnalysisEngine getEngine() { return engine; } + + /** + * Returns data source maintained by this scheduler + * @return + */ + public SAMDataSource getSAMDataSource() { return reads; } + + /** + * Returns the reference maintained by this scheduler. + * @return The reference maintained by this scheduler. + */ + public IndexedFastaSequenceFile getReference() { return reference; } + + protected void cleanup() { + try { + mBeanServer.unregisterMBean(mBeanName); + } + catch (JMException ex) { + throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex); + } + } + + /** + * Returns a traversal engine suitable for use, associated with key + * + * Key is an arbitrary object that is used to retrieve the same traversal + * engine over and over. This can be important in the case where the + * traversal engine has data associated with it in some other context, + * and we need to ensure that the context always sees the same traversal + * engine. This happens in the HierarchicalMicroScheduler, where you want + * the a thread executing traversals to retrieve the same engine each time, + * as outputs are tracked w.r.t. that engine. 
+ * + * If no engine is associated with key yet, pops the next available engine + * from the available ones maintained by this + * microscheduler. Note that it's a runtime error to pop a traversal engine + * from this scheduler if there are none available. Callers that + * once pop'd an engine for use must return it with returnTraversalEngine + * + * @param key the key to associate with this engine + * @return a non-null TraversalEngine suitable for execution in this scheduler + */ + @Ensures("result != null") + protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { + if ( key == null ) throw new IllegalArgumentException("key cannot be null"); + + final TraversalEngine engine = allocatedTraversalEngines.get(key); + if ( engine == null ) { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); + return allocatedTraversalEngines.get(key); + } else { + return engine; + } + } + + /** + * Return a borrowed traversal engine to this MicroScheduler, for later use + * in another traversal execution + * + * @param key the key used to id the engine, provided to the borrowTraversalEngine function + * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. + */ + protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { + if ( traversalEngine == null ) + throw new IllegalArgumentException("Attempting to push a null traversal engine"); + if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) + throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); + if ( ! 
allocatedTraversalEngines.containsKey(key) ) + throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); + + // note there's nothing to actually do here, but a function implementation + // might want to do something + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/OutputMergeTask.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ReduceTree.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/ReduceTree.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ReduceTree.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ShardTraverser.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/ShardTraverser.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/TreeReducer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/TreeReducer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/WindowMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/WindowMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/executive/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/executive/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadMateFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/BadMateFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/CountingFilteringIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/DuplicateReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FilterManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/FilterManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MateSameStrandFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MaxInsertSizeFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/MissingReadGroupFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/Platform454Filter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/Platform454Filter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/PlatformUnitFilterHelper.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadFilter.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReadStrandFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SampleFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/SampleFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SampleFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/SingleReadGroupFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/UnmappedReadFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/filters/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/filters/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/DirectOutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/FastqFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/FastqFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/FastqFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/FastqFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/OutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/OutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/OutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/OutputStreamStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/Storage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/Storage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/Storage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/OutputStreamStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/Stub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/Stub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/IterableIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/IterableIterator.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/IterableIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/MalformedBAMErrorReformatingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/NullSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PeekingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PositionTrackingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/PushbackIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIterator.java 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/phonehome/GATKRunReportException.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceDependentFeatureCodec.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java diff --git 
a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..fbbaa6636 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,107 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; + +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet 
seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java new file mode 100644 index 000000000..a587a3984 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -0,0 +1,419 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broad.tribble.util.TabixUtils; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.file.FSLockWithShared; +import org.broadinstitute.sting.utils.instrumentation.Sizeof; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + + +/** + * + * @author aaron + * ` + * Class RMDTrackBuilder + * + * This class keeps track of the available codecs, and knows how to put together a track of + * that gets iterators from the FeatureReader using Tribble. + * + */ +public class RMDTrackBuilder { // extends PluginManager { + /** + * our log, which we use to capture anything from this class + */ + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + + // private sequence dictionary we use to set our tracks with + private final SAMSequenceDictionary dict; + + /** + * Private genome loc parser to use when building out new locs. 
+ */ + private final GenomeLocParser genomeLocParser; + + /** + * Validation exclusions, for validating the sequence dictionary. + */ + private ValidationExclusion.TYPE validationExclusionType; + + private final FeatureManager featureManager; + + // If true, do not attempt to create index files if they don't exist or are outdated, and don't + // make any file lock acquisition calls on the index files. + private final boolean disableAutoIndexCreation; + + /** + * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally + * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, + * please talk through your approach with the SE team. + * @param dict Sequence dictionary to use. + * @param genomeLocParser Location parser to use. + * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. + * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. + * UNSAFE in general (because it causes us not to lock index files before reading them) -- + * suitable only for test suite use. 
+ */ + public RMDTrackBuilder(final SAMSequenceDictionary dict, + final GenomeLocParser genomeLocParser, + final ValidationExclusion.TYPE validationExclusionType, + final boolean disableAutoIndexCreation) { + this.dict = dict; + this.validationExclusionType = validationExclusionType; + this.genomeLocParser = genomeLocParser; + this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); + this.disableAutoIndexCreation = disableAutoIndexCreation; + } + + /** + * Return the feature manager this RMDTrackBuilder is using the create tribble tracks + * + * @return + */ + public FeatureManager getFeatureManager() { + return featureManager; + } + + /** + * create a RMDTrack of the specified type + * + * @param fileDescriptor a description of the type of track to build. + * + * @return an instance of the track + */ + public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + String name = fileDescriptor.getName(); + File inputFile = new File(fileDescriptor.getFile()); + + FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); + if (descriptor == null) + throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); + + // return a feature reader track + Pair pair; + if (inputFile.getAbsolutePath().endsWith(".gz")) + pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); + else + pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); + if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name)); + } + + /** + * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. + * @param codecClass Type of Tribble codec class to build. + * @param inputFile Input file type to use. 
+ * @return An RMDTrack, suitable for accessing reference metadata. + */ + public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { + final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); + + if (descriptor == null) + throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName()); + + return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); + } + + /** + * create a feature reader, without assuming there exists an index. This code assumes the feature + * reader of the appropriate type will figure out what the right index type is, and determine if it + * exists. + * + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the track + * @param inputFile the file to load + * @return a feature reader implementation + */ + private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { + // we might not know the index type, try loading with the default reader constructor + logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); + try { + final File indexFile = new File(inputFile.getAbsoluteFile() + TabixUtils.STANDARD_INDEX_EXTENSION); + final SAMSequenceDictionary dict = TabixUtils.getSequenceDictionary(indexFile); + return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)), dict); + } catch (TribbleException e) { + throw new UserException(e.getMessage(), e); + } + } + + /** + * add a name to the codec, if it takes one + * @param descriptor the class to create a codec for + * @param name the name to assign this codec + * @return the feature codec itself + */ + private FeatureCodec createCodec(FeatureManager.FeatureDescriptor descriptor, String name) { + return 
featureManager.createCodec(descriptor, name, genomeLocParser); + } + + /** + * create a feature source object given: + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the codec + * @param inputFile the tribble file to parse + * @param storageType How the RMD is streamed into the input file. + * @return the input file as a FeatureReader + */ + private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, + String name, + File inputFile, + RMDStorageType storageType) { + // Feature source and sequence dictionary to use as the ultimate reference + AbstractFeatureReader featureSource = null; + SAMSequenceDictionary sequenceDictionary = null; + + // Detect whether or not this source should be indexed. + boolean canBeIndexed = (storageType == RMDStorageType.FILE); + + if(canBeIndexed) { + try { + Index index = loadIndex(inputFile, createCodec(descriptor, name)); + try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } + catch (ReviewedStingException e) { } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + + // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match + if (sequenceDictionary.size() == 0 && dict != null) { + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + + if ( ! 
disableAutoIndexCreation ) { + File indexFile = Tribble.indexFile(inputFile); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); + } + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + } + + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), index); + } + catch (TribbleException e) { + throw new UserException(e.getMessage()); + } + catch (IOException e) { + throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); + } + } + else { + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), false); + } + + return new Pair(featureSource,sequenceDictionary); + } + + /** + * create an index for the input file + * @param inputFile the input file + * @param codec the codec to use + * @return a linear index for the specified type + * @throws IOException if we cannot write the index file + */ + public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { + final File indexFile = Tribble.indexFile(inputFile); + final FSLockWithShared lock = new FSLockWithShared(indexFile); + Index idx = null; + + // If the index file exists and is readable, attempt to load it from disk. We'll get null back + // if a problem was discovered with the index file when it was inspected, and we'll get an + // in-memory index back in the case where the index file could not be locked. + if (indexFile.canRead()) { + idx = disableAutoIndexCreation ? 
loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode + : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); + } + + // If we have an index, it means we either loaded it from disk without issue or we created an in-memory + // index due to not being able to acquire a lock. + if (idx != null) return idx; + + // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index + idx = createIndexInMemory(inputFile, codec); + if ( ! disableAutoIndexCreation ) { + writeIndexToDisk(idx, indexFile, lock); + } + return idx; + } + + /** + * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if + * a lock could not be obtained. Returns null if a problem was discovered with the index file when it + * was examined (eg., it was out-of-date). + * + * @param inputFile the input file + * @param codec the codec to read from + * @param indexFile the index file itself + * @param lock the lock file + * @return an index, or null if we couldn't load one + * @throws IOException if we fail for FS issues + */ + protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { + boolean locked = false; + Index idx = null; + + try { + locked = lock.sharedLock(); + + if ( ! 
locked ) { // can't lock file + logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", + indexFile.getAbsolutePath())); + idx = createIndexInMemory(inputFile, codec); + } + else { + idx = loadFromDisk(inputFile, indexFile); + } + } finally { + if (locked) lock.unlock(); + } + return idx; + } + + /** + * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) + * @param inputFile the input file + * @param indexFile the input file, plus the index extension + * @return an Index, or null if we're unable to load + */ + protected Index loadFromDisk( final File inputFile, final File indexFile ) { + logger.debug("Loading Tribble index from disk for file " + inputFile); + Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + // check if the file is up-to date (filestamp and version check) + if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) + return index; + else if (indexFile.lastModified() < inputFile.lastModified()) + logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable + logger.warn("Index file " + indexFile + " is out of date (old version), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + + if ( ! 
disableAutoIndexCreation ) { + boolean deleted = indexFile.delete(); + if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); + } + + return null; + } + + + /** + * attempt to write the index to disk + * @param index the index to write to disk + * @param indexFile the index file location + * @param lock the locking object + * @throws IOException when unable to create the new index + */ + private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { + if ( disableAutoIndexCreation ) { + return; + } + + boolean locked = false; + + try { + locked = lock.exclusiveLock(); + + if (locked) { + logger.info("Writing Tribble index to disk for file " + indexFile); + LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + index.write(stream); + stream.close(); + } + else // we can't write it to disk, just store it in memory, tell them this + logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); + + try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } + catch ( ReviewedStingException e) { } + } + finally { + if (locked) lock.unlock(); + } + + } + + /** + * create the index in memory, given the input file and feature codec + * @param inputFile the input file + * @param codec the codec + * @return a LinearIndex, given the file location + * @throws IOException when unable to create the index in memory + */ + protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { + // this can take a while, let them know what we're doing + logger.debug("Creating Tribble index in memory for file " + inputFile); + Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); + return idx; + } + + /** + * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. + * (that each contig in the index is in the sequence dictionary). + * @param inputFile for proper error message formatting. + * @param dict the sequence dictionary + * @param index the index file + */ + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); + + // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); + + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); + } + + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/LocationAwareSeekableRODIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumn.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumn.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportDataType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportDataType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportTable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportVersion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/report/GATKReportVersion.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Affection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Affection.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Gender.java similarity index 100% rename 
from public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Gender.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Sample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Sample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDB.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Trio.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/samples/Trio.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/traversals/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Allows.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Allows.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Attribution.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Attribution.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Attribution.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/BAQMode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/BAQMode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/By.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/By.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/By.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/By.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DataSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DataSource.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Downsample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Downsample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/LocusWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/LocusWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplex.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplex.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplex.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplexer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Multiplexer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Multiplexer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionBy.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionBy.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionBy.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/PartitionType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RMD.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/RMD.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RMD.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadFilters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadFilters.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadPairWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/ReadWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RefWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RefWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Reference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Reference.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Reference.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Requires.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Requires.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Requires.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RodWalker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/RodWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/TreeReducible.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/TreeReducible.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Walker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Walker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/WalkerName.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/WalkerName.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/WalkerName.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Window.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/Window.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/Window.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCountConstants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantOverlapAnnotator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatible.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ExperimentalAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java 
similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/RodRequiringAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/StandardAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/WorkInProgressAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java 
rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java new file mode 100644 index 000000000..3a51a9a6a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -0,0 +1,1109 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.coverage; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +import java.io.File; +import java.io.PrintStream; +import 
java.util.*; + +/** + * Assess sequence coverage by a wide array of metrics, partitioned by sample, read group, or library + * + *

+ * This tool processes a set of bam files to determine coverage at different levels of partitioning and + * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by + * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, + * and/or percentage of bases covered to or beyond a threshold. + * Additionally, reads and bases can be filtered by mapping or base quality score. + * + *

Input

+ *

+ * One or more bam files (with proper headers) to be analyzed for coverage statistics + *

+ *

+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * (for information about creating the REFSEQ Rod, please consult the online documentation) + *

+ *

Output

+ *

+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

+ * - no suffix: per locus coverage + *

+ * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

+ * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

+ * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

+ * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + *

+ * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

+ * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

+ * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

+ * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T DepthOfCoverage \
+ *   -o file_name_base \
+ *   -I input_bams.list
+ *   [-geneList refSeq.sorted.txt] \
+ *   [-pt readgroup] \
+ *   [-ct 4 -ct 6 -ct 10] \
+ *   [-L my_capture_genes.interval_list]
+ * 
+ * + */ +// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time +// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) +// todo -- alter logarithmic scaling to spread out bins more +// todo -- allow for user to set linear binning (default is logarithmic) +// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.MC) +@By(DataSource.REFERENCE) +@PartitionBy(PartitionType.NONE) +@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE) +public class DepthOfCoverage extends LocusWalker>, CoveragePartitioner> implements TreeReducible { + @Output + @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) + Map out; + /** + * Reads with mapping quality values lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + int minMappingQuality = -1; + /** + * Reads with mapping quality values higher than this threshold will be skipped. The default value is the largest number that can be represented as an integer by the program. + */ + @Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + int maxMappingQuality = Integer.MAX_VALUE; + /** + * Bases with quality scores lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. 
+ */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE) + byte minBaseQuality = -1; + /** + * Bases with quality scores higher than this threshold will be skipped. The default value is the largest number that can be represented as a byte. + */ + @Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE) + byte maxBaseQuality = Byte.MAX_VALUE; + + @Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false) + CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS; + + /** + * Instead of reporting depth, the program will report the base pileup at each locus + */ + @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Add base counts to per-locus output", required = false) + boolean printBaseCounts = false; + + /** + * Disabling the tabulation of locus statistics (# loci covered by sample by coverage) should speed up processing. + */ + @Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Do not calculate per-sample per-depth counts of loci", required = false) + boolean omitLocusTable = false; + + /** + * Disabling the tabulation of interval statistics (mean, median, quartiles AND # intervals by sample by coverage) should speed up processing. This option is required in order to use -nt parallelism. + */ + @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Do not calculate per-interval statistics", required = false) + boolean omitIntervals = false; + /** + * Disabling the tabulation of total coverage at every base should speed up processing. 
+ */ + @Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Do not output depth of coverage at each base", required = false) + boolean omitDepthOutput = false; + + /** + * Specify a RefSeq file for use in aggregating coverage statistics over genes. + */ + @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate coverage statistics over this list of genes", required = false) + File refSeqGeneList = null; + + /** + * Output file format (e.g. csv, table, rtable); defaults to r-readable table. + */ + @Argument(fullName = "outputFormat", doc = "The format of the output file", required = false) + String outputFormat = "rtable"; + + + // --------------------------------------------------------------------------- + // + // Advanced arguments + // + // --------------------------------------------------------------------------- + + /** + * Normally, sites where the reference is N (or another non-canonical base) are skipped. If this option is enabled, these sites will be included in DoC calculations if there is coverage from neighboring reads. + */ + @Advanced + @Argument(fullName = "includeRefNSites", doc = "Include sites where the reference is N", required = false) + boolean includeRefNBases = false; + /** + * Use this option to calibrate what bins you want before performing full calculations on your data. + */ + @Advanced + @Argument(fullName = "printBinEndpointsAndExit", doc = "Print the bin values and exit immediately", required = false) + boolean printBinEndpointsAndExit = false; + /** + * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin. + */ + @Advanced + @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false, minValue = 0) + int start = 1; + /** + * Sets the high-coverage cutoff for granular binning. All loci with depth > STOP are counted in the last bin. 
+ */ + @Advanced + @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false, minValue = 1) + int stop = 500; + /** + * Sets the number of bins for granular binning + */ + @Advanced + @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false, minValue = 0, minRecommendedValue = 1) + int nBins = 499; + + /** + * This option simply disables writing separate files for per-sample summary statistics (total, mean, median, quartile coverage per sample). These statistics are still calculated internally, so enabling this option will not improve runtime. + */ + @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Do not output the summary files per-sample", required = false) + boolean omitSampleSummary = false; + /** + * By default, coverage is partitioning by sample, but it can be any combination of sample, readgroup and/or library. + */ + @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage", required = false) + Set partitionTypes = EnumSet.of(DoCOutputType.Partition.sample); + + /** + * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. + */ + @Advanced + @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) + boolean includeDeletions = false; + + @Advanced + @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) + boolean ignoreDeletionSites = false; + + /** + * For summary file outputs, report the percentage of bases covered to an amount equal to or greater than this number (e.g. % bases >= CT for each sample). Defaults to 15; can take multiple arguments. 
+ */ + @Advanced + @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "Coverage threshold (in percent) for summarizing statistics", required = false) + int[] coverageThresholds = {15}; + + String[] OUTPUT_FORMATS = {"table","rtable","csv"}; + String separator = "\t"; + Map> orderCheck = new HashMap>(); + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD WALKER METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } + + public void initialize() { + + if ( printBinEndpointsAndExit ) { + int[] endpoints = DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins); + System.out.print("[ "); + for ( int e : endpoints ) { + System.out.print(e+" "); + } + System.out.println("]"); + System.exit(0); + } + + // Check the output format + boolean goodOutputFormat = false; + for ( String f : OUTPUT_FORMATS ) { + goodOutputFormat = goodOutputFormat || f.equals(outputFormat); + } + + if ( ! goodOutputFormat ) { + throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat); + } + + if ( outputFormat.equals("csv") ) { + separator = ","; + } + + if ( ! 
omitDepthOutput ) { // print header + PrintStream out = getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary); + out.printf("%s\t%s","Locus","Total_Depth"); + for (DoCOutputType.Partition type : partitionTypes ) { + out.printf("\t%s_%s","Average_Depth",type.toString()); + } + + // get all the samples + HashSet allSamples = getSamplesFromToolKit(partitionTypes); + ArrayList allSampleList = new ArrayList(allSamples.size()); + for ( String s : allSamples ) { + allSampleList.add(s); + } + Collections.sort(allSampleList); + + for ( String s : allSampleList) { + out.printf("\t%s_%s","Depth_for",s); + if ( printBaseCounts ) { + out.printf("\t%s_%s",s,"base_counts"); + } + } + + out.printf("%n"); + + } else { + logger.info("Per-Locus Depth of Coverage output was omitted"); + } + + for (DoCOutputType.Partition type : partitionTypes ) { + orderCheck.put(type,new ArrayList()); + for ( String id : getSamplesFromToolKit(type) ) { + orderCheck.get(type).add(id); + } + Collections.sort(orderCheck.get(type)); + } + } + + private HashSet getSamplesFromToolKit( Collection types ) { + HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order + for (DoCOutputType.Partition t : types ) { + partitions.addAll(getSamplesFromToolKit(t)); + } + + return partitions; + } + + private HashSet getSamplesFromToolKit(DoCOutputType.Partition type) { + HashSet partition = new HashSet(); + if ( type == DoCOutputType.Partition.sample ) { + partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); + } else if ( type == DoCOutputType.Partition.readgroup ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); + } + } else if ( type == DoCOutputType.Partition.library ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getLibrary()); + } + } else if ( type == 
DoCOutputType.Partition.center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getSequencingCenter()); + } + } else if ( type == DoCOutputType.Partition.platform ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getPlatform()); + } + } else if ( type == DoCOutputType.Partition.sample_by_center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_cn_%s",rg.getSample(),rg.getSequencingCenter())); + } + } else if ( type == DoCOutputType.Partition.sample_by_platform ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_pl_%s",rg.getSample(),rg.getPlatform())); + } + } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(String.format("%s_pl_%s_cn_%s",rg.getSample(),rg.getPlatform(),rg.getSequencingCenter())); + } + } else { + throw new ReviewedStingException("Invalid aggregation type sent to getSamplesFromToolKit"); + } + + return partition; + } + + public boolean isReduceByInterval() { + return ( ! omitIntervals ); + } + + public CoveragePartitioner reduceInit() { + CoveragePartitioner aggro = new CoveragePartitioner(partitionTypes,start,stop,nBins); + for (DoCOutputType.Partition t : partitionTypes ) { + aggro.addIdentifiers(t,getSamplesFromToolKit(t)); + } + aggro.initialize(includeDeletions,omitLocusTable); + checkOrder(aggro); + return aggro; + } + + public Map> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) { + if ( ! 
omitDepthOutput ) { + getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) + //System.out.printf("\t[log]\t%s",ref.getLocus()); + } + + return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes); + } else { + return null; + } + } + + public CoveragePartitioner reduce(Map> thisMap, CoveragePartitioner prevReduce) { + if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns) + if ( ! omitDepthOutput ) { + //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order + printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType()); + // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without + // turning on omit + } + + prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object + } + + return prevReduce; + } + + public CoveragePartitioner treeReduce(CoveragePartitioner left, CoveragePartitioner right) { + left.merge(right); + return left; + } + + //////////////////////////////////////////////////////////////////////////////////// + // INTERVAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + public void onTraversalDone( List> statsByInterval ) { + if ( refSeqGeneList != null && partitionTypes.contains(DoCOutputType.Partition.sample) ) { + printGeneStats(statsByInterval); + } + + if ( statsByInterval.size() > 0 ) { + for(DoCOutputType.Partition partition: partitionTypes) { + if ( checkType(statsByInterval.get(0).getSecond().getCoverageByAggregationType(partition) ,partition) ) { + printIntervalStats(statsByInterval, + 
getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary), + getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics), + partition); + } else { + throw new ReviewedStingException("Partition type "+partition.toString()+" had no entries. Please check that your .bam header has all appropriate partition types."); + } + } + } else { + throw new UserException.CommandLineException("Cannot reduce by interval without a list of intervals. Please provide an interval list using the -L argument."); + } + + onTraversalDone(mergeAll(statsByInterval)); + + } + + public CoveragePartitioner mergeAll(List> stats) { + CoveragePartitioner first = stats.remove(0).second; + for ( Pair iStat : stats ) { + treeReduce(first,iStat.second); + } + + return first; + } + + private DepthOfCoverageStats printIntervalStats(List> statsByInterval, PrintStream summaryOut, PrintStream statsOut, DoCOutputType.Partition type) { + Pair firstPair = statsByInterval.get(0); + CoveragePartitioner firstAggregator = firstPair.second; + DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type); + + StringBuilder summaryHeader = new StringBuilder(); + summaryHeader.append("Target"); + summaryHeader.append(separator); + summaryHeader.append("total_coverage"); + summaryHeader.append(separator); + summaryHeader.append("average_coverage"); + + for ( String s : firstStats.getAllSamples() ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_mean_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q1"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_median"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q3"); + for ( int thresh : 
coverageThresholds ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_%_above_"); + summaryHeader.append(thresh); + } + } + + summaryOut.printf("%s%n",summaryHeader); + + int[][] nTargetsByAvgCvgBySample = new int[firstStats.getHistograms().size()][firstStats.getEndpoints().length+1]; + + for ( Pair targetAggregator : statsByInterval ) { + + Pair targetStats = new Pair( + targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type)); + printTargetSummary(summaryOut,targetStats); + updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); + } + + printIntervalTable(statsOut,nTargetsByAvgCvgBySample,firstStats.getEndpoints()); + + return firstStats; + } + + private void printGeneStats(List> statsByTarget) { + logger.debug("statsByTarget size is "+Integer.toString(statsByTarget.size())); + logger.debug("Initializing refseq..."); + LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq(); + logger.debug("Refseq init done."); + List> statsByGene = new ArrayList>();// maintains order + Map geneNamesToStats = new HashMap(); // allows indirect updating of objects in list + + for ( Pair targetStats : statsByTarget ) { + String gene = getGeneName(targetStats.first,refseqIterator); + if ( geneNamesToStats.keySet().contains(gene) ) { + logger.debug("Merging "+geneNamesToStats.get(gene).toString()+" and "+targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample).toString()); + geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); + } else { + DepthOfCoverageStats merger = new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); + geneNamesToStats.put(gene,merger); + statsByGene.add(new Pair(gene,merger)); + } + } + + PrintStream geneSummaryOut = getCorrectStream(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); + 
StringBuilder summaryHeader = new StringBuilder(); + summaryHeader.append("Gene"); + summaryHeader.append(separator); + summaryHeader.append("total_coverage"); + summaryHeader.append(separator); + summaryHeader.append("average_coverage"); + + for ( String s : statsByTarget.get(0).second.getCoverageByAggregationType(DoCOutputType.Partition.sample).getAllSamples() ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_total_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_mean_cvg"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q1"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_median"); + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_granular_Q3"); + for ( int thresh : coverageThresholds ) { + summaryHeader.append(separator); + summaryHeader.append(s); + summaryHeader.append("_%_above_"); + summaryHeader.append(thresh); + } + } + + geneSummaryOut.printf("%s%n",summaryHeader); + + for ( Pair geneStats : statsByGene ) { + printTargetSummary(geneSummaryOut,geneStats); + } + } + + //blatantly stolen from Andrew Kernytsky + private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) { + logger.debug("Examining "+target.toString()); + if (refseqIterator == null) { return "UNKNOWN"; } + + RODRecordList annotationList = refseqIterator.seekForward(target); + logger.debug("Annotation list is " + (annotationList == null ? 
"null" : annotationList.getName())); + if (annotationList == null) { return "UNKNOWN"; } + + for(GATKFeature rec : annotationList) { + if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) { + logger.debug("We do overlap "+ rec.getUnderlyingObject().toString()); + return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName(); + } + logger.debug("No overlap"); + } + + return "UNKNOWN"; + + } + + private LocationAwareSeekableRODIterator initializeRefSeq() { + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + getToolkit().getArguments().unsafe, + getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); + RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,refSeqGeneList); + return new SeekableRODIterator(refseq.getHeader(),refseq.getSequenceDictionary(),getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(),refseq.getIterator()); + } + + private void printTargetSummary(PrintStream output, Pair intervalStats) { + DepthOfCoverageStats stats = intervalStats.second; + int[] bins = stats.getEndpoints(); + + StringBuilder targetSummary = new StringBuilder(); + targetSummary.append(intervalStats.first.toString()); + targetSummary.append(separator); + targetSummary.append(stats.getTotalCoverage()); + targetSummary.append(separator); + targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); + + for ( String s : stats.getAllSamples() ) { + targetSummary.append(separator); + targetSummary.append(stats.getTotals().get(s)); + targetSummary.append(separator); + targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); + targetSummary.append(separator); + int median = getQuantile(stats.getHistograms().get(s),0.5); + int q1 = getQuantile(stats.getHistograms().get(s),0.25); + int q3 = 
getQuantile(stats.getHistograms().get(s),0.75); + targetSummary.append(formatBin(bins,q1)); + targetSummary.append(separator); + targetSummary.append(formatBin(bins,median)); + targetSummary.append(separator); + targetSummary.append(formatBin(bins,q3)); + for ( int thresh : coverageThresholds ) { + targetSummary.append(String.format("%s%.1f",separator,getPctBasesAbove(stats.getHistograms().get(s),stats.value2bin(thresh)))); + } + + } + + output.printf("%s%n", targetSummary); + } + + private String formatBin(int[] bins, int quartile) { + if ( quartile >= bins.length ) { + return String.format(">%d",bins[bins.length-1]); + } else if ( quartile < 0 ) { + return String.format("<%d",bins[0]); + } else { + return String.format("%d",bins[quartile]); + } + } + + private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) { + String colHeader = outputFormat.equals("rtable") ? "" : "Number_of_sources"; + output.printf(colHeader + separator+"depth>=%d",0); + for ( int col = 0; col < intervalTable[0].length-1; col ++ ) { + output.printf(separator+"depth>=%d",cutoffs[col]); + } + + output.printf(String.format("%n")); + for ( int row = 0; row < intervalTable.length; row ++ ) { + output.printf("At_least_%d_samples",row+1); + for ( int col = 0; col < intervalTable[0].length; col++ ) { + output.printf(separator+"%d",intervalTable[row][col]); + } + output.printf(String.format("%n")); + } + } + + /* + * @updateTargetTable + * The idea is to have counts for how many *targets* have at least K samples with + * median coverage of at least X. + * To that end: + * Iterate over the samples the DOCS object, determine how many there are with + * median coverage > leftEnds[0]; how many with median coverage > leftEnds[1] + * and so on. Then this target has at least N, N-1, N-2, ... 1, 0 samples covered + * to leftEnds[0] and at least M,M-1,M-2,...1,0 samples covered to leftEnds[1] + * and so on. 
+ */ + private void updateTargetTable(int[][] table, DepthOfCoverageStats stats) { + int[] cutoffs = stats.getEndpoints(); + int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything + for ( int i = 0; i < countsOfMediansAboveCutoffs.length; i ++) { + countsOfMediansAboveCutoffs[i]=0; + } + + for ( String s : stats.getAllSamples() ) { + int medianBin = getQuantile(stats.getHistograms().get(s),0.5); + for ( int i = 0; i <= medianBin; i ++) { + countsOfMediansAboveCutoffs[i]++; + } + } + + for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) { + for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) { + table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++; + // the -1 is due to counts being 1-based and offsets being 0-based + } + } + } + + //////////////////////////////////////////////////////////////////////////////////// + // FINAL ON TRAVERSAL DONE + //////////////////////////////////////////////////////////////////////////////////// + + public void onTraversalDone(CoveragePartitioner coverageProfiles) { + /////////////////// + // OPTIONAL OUTPUTS + ////////////////// + + if ( ! omitSampleSummary ) { + logger.info("Printing summary info"); + for (DoCOutputType.Partition type : partitionTypes ) { + outputSummaryFiles(coverageProfiles,type); + } + } + + if ( ! 
omitLocusTable ) { + logger.info("Printing locus summary"); + for (DoCOutputType.Partition type : partitionTypes ) { + outputLocusFiles(coverageProfiles,type); + } + } + } + + private void outputLocusFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { + printPerLocus(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts), + getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions), + coverageProfiles.getCoverageByAggregationType(type),type); + } + + private void outputSummaryFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { + printPerSample(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics),coverageProfiles.getCoverageByAggregationType(type)); + printSummary(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary),coverageProfiles.getCoverageByAggregationType(type)); + } + + //////////////////////////////////////////////////////////////////////////////////// + // HELPER OUTPUT METHODS + //////////////////////////////////////////////////////////////////////////////////// + + private void printPerSample(PrintStream output,DepthOfCoverageStats stats) { + int[] leftEnds = stats.getEndpoints(); + + StringBuilder hBuilder = new StringBuilder(); + if ( ! 
outputFormat.equals("rTable")) { + hBuilder.append("Source_of_reads"); + } + hBuilder.append(separator); + hBuilder.append(String.format("from_0_to_%d)%s",leftEnds[0],separator)); + for ( int i = 1; i < leftEnds.length; i++ ) + hBuilder.append(String.format("from_%d_to_%d)%s",leftEnds[i-1],leftEnds[i],separator)); + hBuilder.append(String.format("from_%d_to_inf%n",leftEnds[leftEnds.length-1])); + output.print(hBuilder.toString()); + Map histograms = stats.getHistograms(); + + for ( Map.Entry p : histograms.entrySet() ) { + StringBuilder sBuilder = new StringBuilder(); + sBuilder.append(String.format("sample_%s",p.getKey())); + for ( long count : p.getValue() ) { + sBuilder.append(String.format("%s%d",separator,count)); + } + sBuilder.append(String.format("%n")); + output.print(sBuilder.toString()); + } + } + + private void printPerLocus(PrintStream output, PrintStream coverageOut, DepthOfCoverageStats stats, DoCOutputType.Partition partitionType) { + int[] endpoints = stats.getEndpoints(); + int samples = stats.getHistograms().size(); + + long[][] baseCoverageCumDist = stats.getLocusCounts(); + + // rows - # of samples + // columns - depth of coverage + + boolean printSampleColumnHeader = outputFormat.equals("csv") || outputFormat.equals("table"); + + StringBuilder header = new StringBuilder(); + if ( printSampleColumnHeader ) { + // mhanna 22 Aug 2010 - Deliberately force this header replacement to make sure integration tests pass. + // TODO: Update integration tests and get rid of this. + header.append(partitionType == DoCOutputType.Partition.readgroup ? 
"read_group" : partitionType.toString()); + } + header.append(String.format("%sgte_0",separator)); + for ( int d : endpoints ) { + header.append(String.format("%sgte_%d",separator,d)); + } + header.append(String.format("%n")); + + output.print(header); + coverageOut.print(header); + + for ( int row = 0; row < samples; row ++ ) { + output.printf("%s_%d","NSamples",row+1); + for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) { + output.printf("%s%d",separator,baseCoverageCumDist[row][depthBin]); + } + output.printf("%n"); + } + + for ( String sample : stats.getAllSamples() ) { + coverageOut.printf("%s",sample); + double[] coverageDistribution = stats.getCoverageProportions(sample); + for ( int bin = 0; bin < coverageDistribution.length; bin ++ ) { + coverageOut.printf("%s%.2f",separator,coverageDistribution[bin]); + } + coverageOut.printf("%n"); + } + } + + private PrintStream getCorrectStream(DoCOutputType.Partition partition, DoCOutputType.Aggregation aggregation, DoCOutputType.FileType fileType) { + DoCOutputType outputType = new DoCOutputType(partition,aggregation,fileType); + if(!out.containsKey(outputType)) + throw new UserException.CommandLineException(String.format("Unable to find appropriate stream for partition = %s, aggregation = %s, file type = %s",partition,aggregation,fileType)); + return out.get(outputType); + } + + private void printSummary(PrintStream output, DepthOfCoverageStats stats) { + if ( ! 
outputFormat.equals("csv") ) { + output.printf("%s\t%s\t%s\t%s\t%s\t%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); + } else { + output.printf("%s,%s,%s,%s,%s,%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); + } + + for ( int thresh : coverageThresholds ) { + output.printf("%s%s%d",separator,"%_bases_above_",thresh); + } + + output.printf("%n"); + + Map histograms = stats.getHistograms(); + Map means = stats.getMeans(); + Map totals = stats.getTotals(); + int[] leftEnds = stats.getEndpoints(); + + for ( Map.Entry p : histograms.entrySet() ) { + String s = p.getKey(); + long[] histogram = p.getValue(); + int median = getQuantile(histogram,0.5); + int q1 = getQuantile(histogram,0.25); + int q3 = getQuantile(histogram,0.75); + // if any of these are larger than the higest bin, put the median as in the largest bin + median = median == histogram.length-1 ? histogram.length-2 : median; + q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; + q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; + if ( ! outputFormat.equals("csv") ) { + output.printf("%s\t%d\t%.2f\t%d\t%d\t%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); + } else { + output.printf("%s,%d,%.2f,%d,%d,%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); + } + + for ( int thresh : coverageThresholds ) { + output.printf("%s%.1f",separator,getPctBasesAbove(histogram,stats.value2bin(thresh))); + } + + output.printf("%n"); + } + + if ( ! 
outputFormat.equals("csv") ) { + output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); + } else { + output.printf("%s,%d,%.2f,%s,%s,%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); + } + } + + private int getQuantile(long[] histogram, double prop) { + int total = 0; + + for ( int i = 0; i < histogram.length; i ++ ) { + total += histogram[i]; + } + + int counts = 0; + int bin = -1; + while ( counts < prop*total ) { + counts += histogram[bin+1]; + bin++; + } + + return bin == -1 ? 0 : bin; + } + + private double getPctBasesAbove(long[] histogram, int bin) { + long below = 0l; + long above = 0l; + for ( int index = 0; index < histogram.length; index++) { + if ( index < bin ) { + below+=histogram[index]; + } else { + above+=histogram[index]; + } + } + + return 100*( (double) above )/( above + below ); + } + + private void printDepths(PrintStream stream, Map> countsBySampleByType, Map> identifiersByType) { + // get the depths per sample and build up the output string while tabulating total and average coverage + StringBuilder perSampleOutput = new StringBuilder(); + int tDepth = 0; + boolean depthCounted = false; + for (DoCOutputType.Partition type : partitionTypes ) { + Map countsByID = countsBySampleByType.get(type); + for ( String s : identifiersByType.get(type) ) { + perSampleOutput.append(separator); + long dp = (countsByID != null && countsByID.keySet().contains(s)) ? sumArray(countsByID.get(s)) : 0 ; + perSampleOutput.append(dp); + if ( printBaseCounts ) { + perSampleOutput.append(separator); + perSampleOutput.append(baseCounts(countsByID != null ? countsByID.get(s) : null )); + } + if ( ! 
depthCounted ) { + tDepth += dp; + } + } + depthCounted = true; // only sum the total depth once + } + + // remember -- genome locus was printed in map() + stream.printf("%s%d",separator,tDepth); + for (DoCOutputType.Partition type : partitionTypes ) { + stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) ); + } + stream.printf("%s%n",perSampleOutput); + } + + private long sumArray(int[] array) { + long i = 0; + for ( int j : array ) { + i += j; + } + return i; + } + + private String baseCounts(int[] counts) { + if ( counts == null ) { + counts = new int[6]; + } + StringBuilder s = new StringBuilder(); + int nbases = 0; + for ( byte b : BaseUtils.EXTENDED_BASES ) { + nbases++; + if ( includeDeletions || b != BaseUtils.Base.D.base ) { + s.append((char)b); + s.append(":"); + s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); + if ( nbases < 6 ) { + s.append(" "); + } + } + } + + return s.toString(); + } + + private void checkOrder(CoveragePartitioner ag) { + // make sure the ordering stored at initialize() is propagated along reduce + for (DoCOutputType.Partition t : partitionTypes ) { + List order = orderCheck.get(t); + List namesInAg = ag.getIdentifiersByType().get(t); + + // todo -- chris check me + Set namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples(); + int index = 0; + for ( String s : namesInAg ) { + if ( ! s.equalsIgnoreCase(order.get(index)) ) { + throw new ReviewedStingException("IDs are out of order for type "+t+"! Aggregator has different ordering"); + } + index++; + } + } + } + + public boolean checkType(DepthOfCoverageStats stats, DoCOutputType.Partition type ) { + if ( stats.getHistograms().size() < 1 ) { + logger.warn("The histogram per partition type "+type.toString()+" was empty\n"+ + "Do your read groups have this type? 
(Check your .bam header)."); + return false; + } else { + return true; + } + } + +} + +class DoCOutputMultiplexer implements Multiplexer { + private final Set partitions; + private final File refSeqGeneList; + private final boolean omitDepthOutput; + private final boolean omitIntervals; + private final boolean omitSampleSummary; + private final boolean omitLocusTable; + + /** + * Create a new multiplexer type using the values of all variable fields. + * @param partitions + * @param refSeqGeneList + * @param omitDepthOutput + * @param omitIntervals + * @param omitSampleSummary + * @param omitLocusTable + */ + public DoCOutputMultiplexer(final Set partitions, + final File refSeqGeneList, + final boolean omitDepthOutput, + final boolean omitIntervals, + final boolean omitSampleSummary, + final boolean omitLocusTable) { + this.partitions = partitions; + this.refSeqGeneList = refSeqGeneList; + this.omitDepthOutput = omitDepthOutput; + this.omitIntervals = omitIntervals; + this.omitSampleSummary = omitSampleSummary; + this.omitLocusTable = omitLocusTable; + } + + public Collection multiplex() { + List outputs = new ArrayList(); + if(!omitDepthOutput) outputs.add(new DoCOutputType(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary)); + + if(!omitIntervals) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics)); + } + } + + if(refSeqGeneList != null && partitions.contains(DoCOutputType.Partition.sample)) { + DoCOutputType geneSummaryOut = new DoCOutputType(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); + outputs.add(geneSummaryOut); + } + + if(!omitSampleSummary) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, 
DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics)); + } + } + + if(!omitLocusTable) { + for(DoCOutputType.Partition partition: partitions) { + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts)); + outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions)); + } + } + + return outputs; + } + + public String transformArgument(final DoCOutputType outputType, final String argument) { + return outputType.getFileName(argument); + } + +} + +class CoveragePartitioner { + private Collection types; + private Map coverageProfiles; + private Map> identifiersByType; + private Set allIdentifiers; + public CoveragePartitioner(Collection typesToUse, int start, int stop, int nBins) { + coverageProfiles = new HashMap(); + identifiersByType = new HashMap>(); + types = typesToUse; + for ( DoCOutputType.Partition type : types ) { + coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins))); + identifiersByType.put(type,new ArrayList()); + } + allIdentifiers = new HashSet(); + } + + public void merge(CoveragePartitioner otherAggregator) { + for ( DoCOutputType.Partition type : types ) { + this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type)); + } + } + + public DepthOfCoverageStats getCoverageByAggregationType(DoCOutputType.Partition t) { + return coverageProfiles.get(t); + } + + public void addIdentifiers(DoCOutputType.Partition t, Set ids) { + for ( String s : ids ) { + coverageProfiles.get(t).addSample(s); + identifiersByType.get(t).add(s); + allIdentifiers.add(s); + } + Collections.sort(identifiersByType.get(t)); + } + + public void initialize(boolean useDels, boolean omitLocusTable) { + for ( DoCOutputType.Partition t : types ) { + 
if ( useDels ) { + coverageProfiles.get(t).initializeDeletions(); + } + if ( ! omitLocusTable ) { + coverageProfiles.get(t).initializeLocusCounts(); + } + } + } + + public void update(Map> countsByIdentifierByType) { + for ( DoCOutputType.Partition t : types ) { + coverageProfiles.get(t).update(countsByIdentifierByType.get(t)); + } + } + + public Set getAllIdentifiers() { + return allIdentifiers; + } + + public Map> getIdentifiersByType() { + return identifiersByType; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/DoCOutputType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java similarity index 100% 
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaSequence.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/ClusteredSnps.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/FiltrationContextWindow.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/package-info.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java new file mode 100644 index 000000000..b6a3853f8 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java @@ -0,0 +1,258 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + * Compare GATK's internal pileup to a reference Samtools pileup + * + *

At every locus in the input set, compares the pileup data (reference base, aligned base from + * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated + * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly + * different output format by default. + *

+ * + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ *

+ * See the Pileup format documentation for more details. + *

+ * + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

    + *
  • original 6 columns as described above
  • + *
  • 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the + * reads covering the sites) for all sites, inserted before the bases and quality strings
  • + *
  • 3 extra columns indicating counts of reads supporting indels (just for indel sites)
  • + *
+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Input

+ *

A BAM file containing your aligned sequence data and a pileup file generated by Samtools covering the region you + * want to examine.

+ * + *

Output

+ *

A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.

+ * + *

Example

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T CheckPileup \
+ *   -R ref.fasta \
+ *   -I your_data.bam \
+ *   --pileup:SAMPileup pileup_file.txt \
+ *   -L chr1:257-275 \
+ *   -o output_file_name
+ * 
+ */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +@Requires(value={DataSource.READS,DataSource.REFERENCE}) +public class CheckPileup extends LocusWalker implements TreeReducible { + /** + * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval. + */ + @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true) + RodBinding pileup; + + @Output + private PrintStream out; + /** + * By default the program will quit if it encounters an error (such as missing truth data for a given position). + * Use this flag to override the default behavior; the program will then simply print an error message and move on + * to the next position. + */ + @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false) + public boolean CONTINUE_AFTER_AN_ERROR = false; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + SAMPileupFeature truePileup = getTruePileup( tracker ); + + if ( truePileup == null ) { + out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar())); + if ( ! CONTINUE_AFTER_AN_ERROR ) { + throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", + context.getLocation(), new String(pileup.getBases()))); + } + } else { + String pileupDiff = pileupDiff(pileup, truePileup, true); + if ( pileupDiff != null ) { + out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString()); + if ( ! 
CONTINUE_AFTER_AN_ERROR ) { + throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff)); + } + } + } + + return pileup.getNumberOfElements(); + } + + private static String maybeSorted( final String x, boolean sortMe ) + { + if ( sortMe ) { + byte[] bytes = x.getBytes(); + Arrays.sort(bytes); + return new String(bytes); + } + else + return x; + } + + public String pileupDiff(final ReadBackedPileup a, final SAMPileupFeature b, boolean orderDependent) + { + if ( a.getNumberOfElements() != b.size() ) + return "Sizes not equal"; + GenomeLoc featureLocation = getToolkit().getGenomeLocParser().createGenomeLoc(b.getChr(),b.getStart(),b.getEnd()); + if ( a.getLocation().compareTo(featureLocation) != 0 ) + return "Locations not equal"; + + String aBases = maybeSorted(new String(a.getBases()), ! orderDependent ); + String bBases = maybeSorted(b.getBasesAsString(), ! orderDependent ); + if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) ) + return "Bases not equal"; + + String aQuals = maybeSorted(new String(a.getQuals()), ! orderDependent ); + String bQuals = maybeSorted(new String(b.getQuals()), ! orderDependent ); + if ( ! aQuals.equals(bQuals) ) + return "Quals not equal"; + + return null; + } + + // Given result of map function + public CheckPileupStats reduceInit() { return new CheckPileupStats(); } + public CheckPileupStats reduce(Integer value, CheckPileupStats sum) { + sum.nLoci++; + sum.nBases += value; + return sum; + } + + public CheckPileupStats treeReduce( CheckPileupStats lhs, CheckPileupStats rhs ) { + CheckPileupStats combined = new CheckPileupStats(); + combined.nLoci = lhs.nLoci + rhs.nLoci; + combined.nBases = lhs.nBases + rhs.nBases; + return combined; + } + + /** + * Extracts the true pileup data from the given rodSAMPileup. Note that this implementation + * assumes that the genotype will only be point or indel. + * @param tracker ROD tracker from which to extract pileup data. 
+ * @return True pileup data. + */ + private SAMPileupFeature getTruePileup( RefMetaDataTracker tracker ) { + SAMPileupFeature pileupArg = tracker.getFirstValue(pileup); + + if( pileupArg == null) + return null; + + if( pileupArg.hasPointGenotype() ) + return pileupArg.getPointGenotype(); + else if( pileupArg.hasIndelGenotype() ) + return pileupArg.getIndelGenotype(); + else + throw new ReviewedStingException("Unsupported pileup type: " + pileupArg); + } +} + +class CheckPileupStats { + public long nLoci = 0; + public long nBases = 0; + + public CheckPileupStats() { + } + + public String toString() { + return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java new file mode 100644 index 000000000..8e99c1828 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -0,0 +1,111 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; + +/** + * A walker that simply throws errors. 
Allows us to test that the engine is behaving as expected with error handling + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_TOY, extraDocs = {CommandLineGATK.class} ) +public class ErrorThrowing extends RefWalker implements TreeReducible, NanoSchedulable { + @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) + public String exceptionToThrow; + + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + + // + // Template code to allow us to build the walker, doesn't actually do anything + // + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( ref == null ) // only throw exception when we are in proper map, not special map(null) call + return null; + + if ( failMethod == FailMethod.MAP ) + fail(); + + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + if ( value != null && failMethod == FailMethod.REDUCE ) + fail(); + return sum; + } + + public Integer treeReduce(final Integer lhs, final Integer rhs) { + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); + return rhs; + } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new 
RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace1") ) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else if ( exceptionToThrow.equals("NoSpace2") ) { + throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java new file mode 100644 index 000000000..48e21fdd0 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java @@ -0,0 +1,217 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies 
or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Emulates the samtools pileup command to print aligned reads + * + *

Prints the alignment in something similar to the Samtools pileup format (see the + * Pileup format documentation for more details about + * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference + * base, read bases, and read qualities. In addition to these default fields, additional information can be added to + * the output as extra columns; see options detailed below.

+ * + *

Emulated command:

+ *
+ *  samtools pileup -f in.ref.fasta -l in.site_list input.bam
+ * 
+ + * + *

Input

+ *

+ * A BAM file and the interval to print. + *

+ * + *

Output

+ *

+ * Alignment of reads formatted in the Pileup style. + *

+ * + *

Example

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -T Pileup \
+ *   -R exampleFASTA.fasta \
+ *   -I exampleBAM.bam \
+ *   -L chr1:257-267
+ *   -o output.txt
+ * 
+ *

Expected output

+ *
+ *     chr1 257 A CAA '&=
+ *     chr1 258 C TCC A:=
+ *     chr1 259 C CCC )A=
+ *     chr1 260 C ACC (=<
+ *     chr1 261 T TCT '44
+ *     chr1 262 A AAA '?:
+ *     chr1 263 A AGA 1'6
+ *     chr1 264 C TCC 987
+ *     chr1 265 C CCC (@(
+ *     chr1 266 C GCC ''=
+ *     chr1 267 T AAT 7%>
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { + + private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names + + @Output + PrintStream out; + + /** + * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, + * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per + * read items are delimited with an '@' character. + */ + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) + public boolean SHOW_VERBOSE = false; + /** + * This enables annotating the pileup to show overlaps with metadata from a ROD file. + * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup + * output at that position will be annotated with the corresponding source ROD identifier. + */ + @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false) + public List> rods = Collections.emptyList(); + /** + * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert + * produced during library generation before sequencing. 
+ */ + @Hidden + @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false) + public boolean outputInsertLength=false; + + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); + + ReadBackedPileup basePileup = context.getBasePileup(); + + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( outputInsertLength ) + s.append(" ").append(insertLengthOutput(basePileup)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); + } + + // Given result of map function + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + /** + * Get a string representation the reference-ordered data. + * @param tracker Container for the reference-ordered data. + * @return String representation of the reference-ordered data. 
+ */ + private String getReferenceOrderedData( RefMetaDataTracker tracker ) { + ArrayList rodStrings = new ArrayList(); + for ( Feature datum : tracker.getValues(rods) ) { + rodStrings.add(datum.toString()); + } + String rodString = Utils.join(", ", rodStrings); + + if ( !rodString.equals("") ) + rodString = "[ROD: " + rodString + "]"; + + return rodString; + } + private static String insertLengthOutput(final ReadBackedPileup pileup) { + + Integer[] insertSizes=new Integer[pileup.depthOfCoverage()]; + + int i=0; + for ( PileupElement p : pileup ) { + insertSizes[i]=p.getRead().getInferredInsertSize(); + ++i; + } + return Utils.join(",",insertSizes); + } + + + private static String createVerboseOutput(final ReadBackedPileup pileup) { + final StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + + sb.append(pileup.getNumberOfDeletions()); + sb.append(" "); + + for ( PileupElement p : pileup ) { + if ( isFirst ) + isFirst = false; + else + sb.append(","); + sb.append(p.getRead().getReadName()); + sb.append(verboseDelimiter); + sb.append(p.getOffset()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getReadLength()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getMappingQuality()); + } + return sb.toString(); + } + + @Override + public void onTraversalDone(Integer result) { + out.println("[REDUCE RESULT] Traversal result is: " + result); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/PrintRODs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/readutils/SplitSamFile.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/StandardEval.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java 
similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java similarity index 
100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java similarity index 
100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/RequiredStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/StandardStratification.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/SortableJexlVCMatchExp.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java new file mode 100644 index 000000000..152128022 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -0,0 +1,357 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.Options; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; + +import java.util.*; + +/** + * Combines VCF records from different sources. + * + *

+ * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data + * and any number of sources can be input. This tool currently supports two different combination types for each of + * variants (the first 8 fields of the VCF) and genotypes (the rest). + * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. + * Union: assumes each rod represents the same set of samples (although this is not enforced); using the + * priority list (if provided), it emits a single record instance at every position represented in the rods. + * + * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD + * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, + * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs + * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just + * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single + * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out + * in the detailed example in the documentation guide. + * + * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful + * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time + * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together + * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing + * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. + * + * Some fine details about the merging algorithm: + *

    + *
  • As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of + * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the + * max QUAL, which resulted in sometime strange downstream confusion
  • + *
+ * + *

Input

+ *

+ * One or more variant sets to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineVariants \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -o output.vcf \
+ *   -genotypeMergeOptions UNIQUIFY
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineVariants \
+ *   --variant:foo input1.vcf \
+ *   --variant:bar input2.vcf \
+ *   -o output.vcf \
+ *   -genotypeMergeOptions PRIORITIZE
+ *   -priority foo,bar
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-50,stop=50)) +public class CombineVariants extends RodWalker implements TreeReducible { + /** + * The VCF files to merge together + * + * variants can take any number of arguments on the command line. Each -V argument + * will be included in the final merged output VCF. If no explicit name is provided, + * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. + * The user can override this by providing an explicit name -V:name,vcf for each -V argument, + * and each named argument will be labeled as such in the output (i.e., set=name rather than + * set=variants2). The order of arguments does not matter unless except for the naming, so + * if you provide an rod priority list and no explicit names than variants, variants2, etc + * are technically order dependent. It is strongly recommended to provide explicit names when + * a rod priority list is provided. 
+ */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variantCollections; + final private List> variants = new ArrayList<>(); + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) + public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null; + + @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) + public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; + + @Hidden + @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) + public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; + + /** + * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
+ */ + @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) + public String PRIORITY_STRING = null; + + @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) + public boolean printComplexMerges = false; + + @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) + public boolean filteredAreUncalled = false; + + /** + * Used to generate a sites-only file. + */ + @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) + public boolean minimalVCF = false; + + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + public boolean EXCLUDE_NON_VARIANTS = false; + + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) + public String SET_KEY = "set"; + + /** + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
+ */ + @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) + public boolean ASSUME_IDENTICAL_SAMPLES = false; + + @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) + public int minimumN = 1; + + /** + * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. + */ + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + public boolean SUPPRESS_COMMAND_LINE_HEADER = false; + + @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) + public boolean MERGE_INFO_WITH_MAX_AC = false; + + private List priority = null; + + /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ + private boolean sitesOnlyVCF = false; + private Set samples; + + public void initialize() { + Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + + if ( vcfWriter instanceof VariantContextWriterStub) { + sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); + + validateAnnotateUnionArguments(); + if ( PRIORITY_STRING == null && genotypeMergeOption == null) { + genotypeMergeOption = 
GATKVariantContextUtils.GenotypeMergeType.UNSORTED; + //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) + logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); + } + + if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && + !SampleUtils.verifyUniqueSamplesNames(vcfRods)) + throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); + + samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); + + if ( SET_KEY.toLowerCase().equals("null") ) + SET_KEY = null; + + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + if ( SET_KEY != null ) + headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); + if ( !ASSUME_IDENTICAL_SAMPLES ) + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); + vcfWriter.writeHeader(vcfHeader); + + // collect the actual rod bindings into a list for use later + for ( final RodBindingCollection variantCollection : variantCollections ) + variants.addAll(variantCollection.getRodBindings()); + } + + private void validateAnnotateUnionArguments() { + Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + + if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) + throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); + + if ( PRIORITY_STRING != null){ + priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); + if ( rodNames.size() != priority.size() ) + throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain 
exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); + + if ( ! rodNames.containsAll(priority) ) + throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } + + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return 0; + + final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + // get all of the vcf rods at this locus + // Need to provide reference bases to simpleMerge starting at current locus + Collection vcs = tracker.getValues(variants, context.getLocation()); + + if ( sitesOnlyVCF ) { + vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + } + + if ( ASSUME_IDENTICAL_SAMPLES ) { + for ( final VariantContext vc : vcs ) { + vcfWriter.add(vc); + } + + return vcs.isEmpty() ? 0 : 1; + } + + int numFilteredRecords = 0; + for (final VariantContext vc : vcs) { + if (vc.filtersWereApplied() && vc.isFiltered()) + numFilteredRecords++; + } + + if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) + return 0; + + final List mergedVCs = new ArrayList<>(); + + if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { + final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); + + // TODO -- clean this up in a refactoring + // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) + if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { + final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); + for ( final VariantContext.Type type : VariantContext.Type.values() ) { + if ( VCsByType.containsKey(type) ) { + VCsByType.get(type).addAll(refs); + break; + } + } + } + + // iterate over the types so that it's deterministic + for (final 
VariantContext.Type type : VariantContext.Type.values()) { + // make sure that it is a variant or in case it is not, that we want to include the sites with no variants + if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { + if (VCsByType.containsKey(type)) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(), + filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + } + } + } + else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, + genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } + else { + logger.warn("Ignoring all records at site " + ref.getLocus()); + } + + for ( final VariantContext mergedVC : mergedVCs ) { + // only operate at the start of events + if ( mergedVC == null ) + continue; + + final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); + // re-compute chromosome counts + VariantContextUtils.calculateChromosomeCounts(builder, false); + + if ( minimalVCF ) + GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); + final VariantContext vc = builder.make(); + if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() ) + vcfWriter.add(builder.make()); + } + + return vcs.isEmpty() ? 
0 : 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return reduce(lhs, rhs); + } + + public void onTraversalDone(Integer sum) {} +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java new file mode 100755 index 000000000..8c8961cb5 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -0,0 +1,633 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, 
publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFHeader; + +import java.io.PrintStream; +import java.util.*; + +/** + * Genotype concordance (per-sample and aggregate counts and frequencies, 
NRD/NRS and site allele overlaps) between two callsets + * + *

+ * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * and for each sample, the genotype-by-genotype counts (e.g. the number of sites at which a sample was + * called homozygous-reference in the EVAL callset, but homozygous-variant in the COMP callset). It outputs these + * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in + * the COMP) and metrics (such as NRD and NRS). + *

+ * + *

Input

+ *

+ * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via + * the -eval and -comp arguments. Typically, the EVAL callset is an experimental set you want to evaluate, while the + * COMP callset is a previously existing set used as a standard for comparison (taken to represent "truth"). + *

+ *

+ * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and + * -cfe arguments, respectively. + *

+ * + *

Output

+ *

+ * Genotype Concordance writes a GATK report to the specified file (via -o), consisting of multiple tables of counts + * and proportions. These tables are constructed on a per-sample basis, and include counts of EVAL vs COMP genotype states, and the + * number of times the alternate alleles between the EVAL and COMP sample did not match up. + *

+ * + *

Term and metrics definitions

+ *

+ *

    + *
  • HET: heterozygous
  • + *
  • HOM_REF: homozygous reference
  • + *
  • HOM_VAR: homozygous variant
  • + *
  • MIXED: something like ./1
  • + *
  • ALLELES_MATCH: counts of calls at the same site where the alleles match
  • + *
  • ALLELES_DO_NOT_MATCH: counts of calls at the same location with different alleles, such as the eval set calling a 'G' alternate allele, and the comp set calling a 'T' alternate allele
  • + *
  • EVAL_ONLY: counts of sites present only in the EVAL set, not in the COMP set
  • + *
  • TRUTH_ONLY: counts of sites present only in the COMP set, not in the EVAL set
  • + *
  • Non-Reference_Discrepancy (NRD): genotype concordance excluding concordant reference sites
  • + *
  • Non-Reference_Sensitivity (NRS): sensitivity of the EVAL calls to polymorphic calls in the COMP set, calculated by (# true positive)/(# true polymorphic)
  • + *
  • Overall_Genotype_Concordance: overall concordance calculated by (# concordant genotypes)/(# genotypes)
  • + *
+ *

+ * + *

Moltenized tables

+ * + *

These tables may be optionally moltenized via the -moltenize argument. That is, the standard table + * + *

+ *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
+ *  NA12878       0.003        0.001            0.000        (...)
+ *  NA12891       0.005        0.000            0.000        (...)
+ *  
+ * + * would instead be displayed + * + *
+ *  NA12878  NO_CALL_HOM_REF   0.003
+ *  NA12878  NO_CALL_HET       0.001
+ *  NA12878  NO_CALL_HOM_VAR   0.000
+ *  NA12891  NO_CALL_HOM_REF   0.005
+ *  NA12891  NO_CALL_HET       0.000
+ *  NA12891  NO_CALL_HOM_VAR   0.000
+ *  (...)
+ *  
+ * + *

Site-level allelic concordance

+ * + *

+ * For strictly bi-allelic VCFs, only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated, + * but where multi-allelic sites are involved counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated. + *

+ *

+ * For example, in the following situation + *

+ *    eval:  ref - A   alt - C
+ *    comp:  ref - A   alt - C,T
+ *  
+ * then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH. + * However, in the case where EVAL has both C and T alternate alleles, both must be observed in the genotypes + * (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If + * one of the alleles has no observations in the genotype fields of the EVAL, the site-level concordance is + * tabulated as though that allele were not present in the record. + *

+ * + *

Monomorphic Records

+ *

+ * A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been + * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the COMP set), or not at all + * (if no record exists in the COMP set). + *

+ *

+ * That is, in the situation + *

+ *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
+ *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
+ * is equivalent to + *
+ *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
+ *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
+ *

+ *

+ * When a record is present in the COMP set the *genotypes* for the monomorphic site will still be used to evaluate + * per-sample genotype concordance counts. + *

+ * + *

Filtered Records

+ * Filtered records are treated as though they were not present in the VCF, unless -ignoreSiteFilters is provided, + * in which case all records are used. There is currently no way to assess concordance metrics on filtered sites + * exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them. + * + + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { + + /** + * The callset you want to evaluate, typically this is where you'd put 'unassessed' callsets. + */ + @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true) + RodBinding evalBinding; + + /** + * The callset you want to treat as 'truth'. Can also be of unknown quality for the sake of callset comparisons. + */ + @Input(fullName="comp",shortName="comp",doc="The variants and genotypes to compare against",required=true) + RodBinding compBinding; + + /** + * The FILTER field of the eval and comp VCFs will be ignored. If this flag is not included, all FILTER sites will + * be treated as not being present in the VCF. (That is, the genotypes will be assigned UNAVAILABLE, as distinct + * from NO_CALL). + */ + @Argument(fullName="ignoreFilters",doc="Filters will be ignored",required=false) + boolean ignoreFilters = false; + + /** + * A genotype level JEXL expression to apply to eval genotypes. Genotypes filtered in this way will be replaced by NO_CALL. + * For instance: -gfe 'GQ<20' will set to no-call any genotype with genotype quality less than 20. + */ + @Argument(shortName="gfe", fullName="genotypeFilterExpressionEval", doc="One or more criteria to use to set EVAL genotypes to no-call. 
"+ + "These genotype-level filters are only applied to the EVAL rod.", required=false) + public ArrayList genotypeFilterExpressionsEval = new ArrayList(); + + /** + * Identical to -gfe except the filter is applied to genotypes in the comp rod. + */ + @Argument(shortName="gfc", fullName="genotypeFilterExpressionComp", doc="One or more criteria to use to set COMP genotypes to no-call. "+ + "These genotype-level filters are only applied to the COMP rod.", required=false) + public ArrayList genotypeFilterExpressionsComp = new ArrayList(); + + /** + * Moltenize the count and proportion tables. Rather than moltenizing per-sample data into a 2x2 table, it is fully + * moltenized into elements. That is, WITHOUT this argument, each row of the table begins with the sample name and + * proceeds directly with counts/proportions of eval/comp counts (for instance HOM_REF/HOM_REF, HOM_REF/NO_CALL). + * + * If the Moltenize argument is given, the output will begin with a sample name, followed by the contrastive genotype + * type (such as HOM_REF/HOM_REF), followed by the count or proportion. This will significantly increase the number of + * rows. + */ + @Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output") + public boolean moltenize = false; + + @Output + PrintStream out; + + private List evalSamples; + private List compSamples; + private List evalJexls = null; + private List compJexls = null; + + // todo -- table with "proportion of overlapping sites" (not just eval/comp margins) [e.g. 
drop no-calls] + // (this will break all the integration tests of course, due to new formatting) + + public void initialize() { + evalJexls = initializeJexl(genotypeFilterExpressionsEval); + compJexls = initializeJexl(genotypeFilterExpressionsComp); + } + + private List initializeJexl(ArrayList genotypeFilterExpressions) { + ArrayList dummyNames = new ArrayList(genotypeFilterExpressions.size()); + int expCount = 1; + for ( String exp : genotypeFilterExpressions ) { + dummyNames.add(String.format("gfe%d",expCount++)); + } + return VariantContextUtils.initializeMatchExps(dummyNames, genotypeFilterExpressions); + } + + public ConcordanceMetrics reduceInit() { + Map headerMap = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(evalBinding,compBinding)); + VCFHeader evalHeader = headerMap.get(evalBinding.getName()); + evalSamples = evalHeader.getGenotypeSamples(); + VCFHeader compHeader = headerMap.get(compBinding.getName()); + compSamples = compHeader.getGenotypeSamples(); + return new ConcordanceMetrics(evalHeader,compHeader); + } + + + public List> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + List> evalCompPair = new ArrayList>(3); + if ( tracker != null && ( + tracker.getValues(evalBinding,ref.getLocus()).size() > 0 || + tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { + + List eval = tracker.getValues(evalBinding,ref.getLocus()); + List comp = tracker.getValues(compBinding,ref.getLocus()); + if ( eval.size() > 1 || comp.size() > 1 ) { + if ( noDuplicateTypes(eval) && noDuplicateTypes(comp) ) { + logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving."); + evalCompPair = resolveMultipleRecords(eval,comp); + } else { + logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records of the same type. 
This locus will be skipped."); + } + } else { + // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. + // note that if there is no eval rod there must be a comp rod, and also the reverse + VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples); + VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples); + evalContext = filterGenotypes(evalContext,ignoreFilters,evalJexls); + compContext = filterGenotypes(compContext,ignoreFilters,compJexls); + evalCompPair.add(new Pair(evalContext,compContext)); + } + } + + return evalCompPair; + } + + private boolean noDuplicateTypes(List vcList) { + HashSet types = new HashSet(vcList.size()); + for ( VariantContext vc : vcList ) { + VariantContext.Type type = vc.getType(); + if ( types.contains(type) ) + return false; + types.add(type); + } + + return true; + } + + /** + * The point of this method is to match up pairs of evals and comps by their type (or alternate alleles for mixed). + * Basically multiple records could exist for a site such as: + * Eval: 20 4000 A C + * Eval: 20 4000 A AC + * Comp: 20 4000 A C + * So for each eval, loop through the comps. If the types match, or for mixed types if eval alleles (non-emptily) + * intersect the comp alleles, pair them up and remove that comp records. + * Continue until we're out of evals or comps. This is n^2, but should rarely actually happen. + * + * The remaining unpaired records get paird with an empty contexts. 
So in the example above we'd get a list of: + * 1 - (20,4000,A/C | 20,4000,A/C) + * 2 - (20,4000,A/AC | Empty ) + * @param evalList - list of eval variant contexts + * @param compList - list of comp variant contexts + * @return resolved pairs of the input lists + */ + private List> resolveMultipleRecords(List evalList, List compList) { + List> resolvedPairs = new ArrayList>(evalList.size()+compList.size()); // oversized but w/e + List pairedEval = new ArrayList(evalList.size()); + for ( VariantContext eval : evalList ) { + VariantContext.Type evalType = eval.getType(); + Set evalAlleles = new HashSet(eval.getAlternateAlleles()); + VariantContext pairedComp = null; + for ( VariantContext comp : compList ) { + if ( evalType.equals(comp.getType()) ) { + pairedComp = comp; + break; + } else if ( eval.isMixed() || comp.isMixed() ) { + for ( Allele compAllele : comp.getAlternateAlleles() ) { + if ( evalAlleles.contains(compAllele) ) { + pairedComp = comp; + break; + } + } + } + } + if ( pairedComp != null ) { + compList.remove(pairedComp); + resolvedPairs.add(new Pair(filterGenotypes(eval,ignoreFilters,evalJexls),filterGenotypes(pairedComp,ignoreFilters,compJexls))); + pairedEval.add(eval); + if ( compList.size() < 1 ) + break; + } + } + evalList.removeAll(pairedEval); + for ( VariantContext unpairedEval : evalList ) { + resolvedPairs.add(new Pair(filterGenotypes(unpairedEval,ignoreFilters,evalJexls),createEmptyContext(unpairedEval,compSamples))); + } + + for ( VariantContext unpairedComp : compList ) { + resolvedPairs.add(new Pair(createEmptyContext(unpairedComp,evalSamples),filterGenotypes(unpairedComp,ignoreFilters,compJexls))); + } + + return resolvedPairs; + } + + public ConcordanceMetrics reduce(List> evalCompList, ConcordanceMetrics metrics) { + for ( Pair evalComp : evalCompList) + metrics.update(evalComp.getFirst(),evalComp.getSecond()); + return metrics; + } + + private static double repairNaN(double d) { + if ( Double.isNaN(d) ) { + return 0.0; + } + return d; 
+ } + + public void onTraversalDone(ConcordanceMetrics metrics) { + // todo -- this is over 200 lines of code just to format the output and could use some serious cleanup + GATKReport report = new GATKReport(); + GATKReportTable concordanceCounts = new GATKReportTable("GenotypeConcordance_Counts","Per-sample concordance tables: comparison counts",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceEvalProportions = new GATKReportTable("GenotypeConcordance_EvalProportions", "Per-sample concordance tables: proportions of genotypes called in eval",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceCompProportions = new GATKReportTable("GenotypeConcordance_CompProportions", "Per-sample concordance tables: proportions of genotypes called in comp",2+GenotypeType.values().length*GenotypeType.values().length); + GATKReportTable concordanceSummary = new GATKReportTable("GenotypeConcordance_Summary","Per-sample summary statistics: NRS, NRD, and OGC",2); + GATKReportTable siteConcordance = new GATKReportTable("SiteConcordance_Summary","Site-level summary statistics",ConcordanceMetrics.SiteConcordanceType.values().length); + if ( moltenize ) { + concordanceCompProportions.addColumn("Sample","%s"); + concordanceCounts.addColumn("Sample","%s"); + concordanceEvalProportions.addColumn("Sample","%s"); + concordanceSummary.addColumn("Sample","%s"); + + concordanceCompProportions.addColumn("Eval_Genotype","%s"); + concordanceCounts.addColumn("Eval_Genotype","%s"); + concordanceEvalProportions.addColumn("Eval_Genotype","%s"); + concordanceSummary.addColumn("Non-Reference_Discrepancy","%.3f"); + + concordanceCompProportions.addColumn("Comp_Genotype","%s"); + concordanceCounts.addColumn("Comp_Genotype","%s"); + concordanceEvalProportions.addColumn("Comp_Genotype","%s"); + concordanceSummary.addColumn("Non-Reference_Sensitivity","%.3f"); + + concordanceCompProportions.addColumn("Proportion","%.3f"); + 
concordanceCounts.addColumn("Count","%d"); + concordanceEvalProportions.addColumn("Proportion","%.3f"); + concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); + + for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { + ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String rowKey = String.format("%s_%s_%s",entry.getKey(),evalType.toString(),compType.toString()); + concordanceCounts.set(rowKey,"Sample",entry.getKey()); + concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(rowKey,"Count",count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { + concordanceEvalProportions.set(rowKey,"Sample",entry.getKey()); + concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + } + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { + concordanceCompProportions.set(rowKey,"Sample",entry.getKey()); + concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + } + String mismatchKey = String.format("%s_%s",entry.getKey(),"Mismatching"); + concordanceCounts.set(mismatchKey,"Sample",entry.getKey()); + concordanceCounts.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + 
concordanceCounts.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Sample",entry.getKey()); + concordanceEvalProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(mismatchKey,"Sample",entry.getKey()); + concordanceCompProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(mismatchKey,"Count",table.getnMismatchingAlt()); + } + + String sampleKey = "ALL"; + ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String rowKey = String.format("%s_%s_%s",sampleKey,evalType.toString(),compType.toString()); + concordanceCounts.set(rowKey,"Sample",sampleKey); + concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(rowKey,"Count",count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { + concordanceEvalProportions.set(rowKey,"Sample",sampleKey); + concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + } + if ( 
compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { + concordanceCompProportions.set(rowKey,"Sample",sampleKey); + concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); + concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); + concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + } + String rowKey = String.format("%s_%s",sampleKey,"Mismatching"); + concordanceCounts.set(rowKey,"Sample",sampleKey); + concordanceCounts.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCounts.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Sample",sampleKey); + concordanceEvalProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(rowKey,"Sample",sampleKey); + concordanceCompProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); + concordanceCompProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); + concordanceEvalProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(rowKey,"Count",table.getnMismatchingAlt()); + + for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { + concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); + concordanceSummary.set(nrsEntry.getKey(),"Non-Reference_Sensitivity",nrsEntry.getValue()); + } + for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { + concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue()); + } + for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { + 
concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); + } + concordanceSummary.set("ALL_NRS_NRD","Sample","ALL"); + concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS()); + concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC()); + + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.addColumn(type.toString(),"%d"); + } + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); + } + + } else { + concordanceCompProportions.addColumn("Sample","%s"); + concordanceCounts.addColumn("Sample","%s"); + concordanceEvalProportions.addColumn("Sample","%s"); + concordanceSummary.addColumn("Sample","%s"); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s", evalType.toString(), compType.toString()); + concordanceCounts.addColumn(colKey,"%d"); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.addColumn(colKey,"%.3f"); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.addColumn(colKey,"%.3f"); + } + } + concordanceEvalProportions.addColumn("Mismatching_Alleles","%.3f"); + concordanceCompProportions.addColumn("Mismatching_Alleles","%.3f"); + concordanceCounts.addColumn("Mismatching_Alleles","%d"); + concordanceSummary.addColumn("Non-Reference Sensitivity","%.3f"); + concordanceSummary.addColumn("Non-Reference Discrepancy","%.3f"); + 
concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.addColumn(type.toString(),"%d"); + } + + for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { + ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); + concordanceEvalProportions.set(entry.getKey(),"Sample",entry.getKey()); + concordanceCompProportions.set(entry.getKey(),"Sample",entry.getKey()); + concordanceCounts.set(entry.getKey(),"Sample",entry.getKey()); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); + int count = table.get(evalType, compType); + concordanceCounts.set(entry.getKey(),colKey,count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + concordanceEvalProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(entry.getKey(),"Mismatching_Alleles",table.getnMismatchingAlt()); + } + + String rowKey = "ALL"; + concordanceCompProportions.set(rowKey,"Sample",rowKey); + concordanceEvalProportions.set(rowKey,"Sample",rowKey); + concordanceCounts.set(rowKey,"Sample",rowKey); + 
ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); + for ( GenotypeType evalType : GenotypeType.values() ) { + for ( GenotypeType compType : GenotypeType.values() ) { + String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); + int count = table.get(evalType,compType); + concordanceCounts.set(rowKey,colKey,count); + if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) + concordanceEvalProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); + if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) + concordanceCompProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); + } + } + concordanceEvalProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); + concordanceCompProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); + concordanceCounts.set(rowKey,"Mismatching_Alleles",table.getnMismatchingAlt()); + + for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { + concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); + concordanceSummary.set(nrsEntry.getKey(),"Non-Reference Sensitivity",nrsEntry.getValue()); + } + for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { + concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue()); + } + for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { + concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); + } + concordanceSummary.set("ALL","Sample","ALL"); + concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS()); + concordanceSummary.set("ALL","Non-Reference 
Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC()); + + for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { + siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); + } + } + + report.addTable(concordanceCompProportions); + report.addTable(concordanceEvalProportions); + report.addTable(concordanceCounts); + report.addTable(concordanceSummary); + report.addTable(siteConcordance); + + report.print(out); + } + + public VariantContext createEmptyContext(VariantContext other, List samples) { + VariantContextBuilder builder = new VariantContextBuilder(); + // set the alleles to be the same + builder.alleles(other.getAlleles()); + builder.loc(other.getChr(),other.getStart(),other.getEnd()); + // set all genotypes to empty + List genotypes = new ArrayList(samples.size()); + for ( String sample : samples ) + genotypes.add(GenotypeBuilder.create(sample, new ArrayList(0))); + builder.genotypes(genotypes); + return builder.make(); + } + + public VariantContext filterGenotypes(VariantContext context, boolean ignoreSiteFilter, List exps) { + if ( ! 
context.isFiltered() || ignoreSiteFilter ) { + List filteredGenotypes = new ArrayList(context.getNSamples()); + for ( Genotype g : context.getGenotypes() ) { + Map matchMap = VariantContextUtils.match(context, g, exps); + boolean filtered = false; + for ( Boolean b : matchMap.values() ) { + if ( b ) { + filtered = true; + break; + } + } + if ( filtered ) { + filteredGenotypes.add(GenotypeBuilder.create(g.getSampleName(),Arrays.asList(Allele.NO_CALL,Allele.NO_CALL),g.getExtendedAttributes())); + } else { + filteredGenotypes.add(g); + } + } + VariantContextBuilder builder = new VariantContextBuilder(context); + builder.genotypes(filteredGenotypes); + return builder.make(); + } + + VariantContextBuilder builder = new VariantContextBuilder(); + builder.alleles(Arrays.asList(context.getReference())); + builder.loc(context.getChr(),context.getStart(),context.getEnd()); + List newGeno = new ArrayList(context.getNSamples()); + for ( Genotype g : context.getGenotypes().iterateInSampleNameOrder() ) { + newGeno.add(GenotypeBuilder.create(g.getSampleName(),new ArrayList())); + } + builder.genotypes(newGeno); + return builder.make(); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java new file mode 100644 index 000000000..5759abc41 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -0,0 +1,304 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and 
to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import 
org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.util.*; + +/** + * Left-aligns indels from a variants file. + * + *

+ * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to + * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. + * + *

Input

+ *

+ * A variant set to left-align and trim. + *

+ * + *

Output

+ *

+ * A left-aligned VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignAndTrimVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! +public class LeftAlignAndTrimVariants extends RodWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. + */ + @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) + protected boolean trimAlleles = false; + + /** + * If this argument is set, split multiallelic records and left-align individual alleles. + * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. + */ + @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) + protected boolean splitMultiallelics = false; + + + @Output(doc="File to which variants should be written") + protected VariantContextWriter baseWriter = null; + + private VariantContextWriter writer; + + private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
+ public void initialize() { + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + + Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); + baseWriter.writeHeader(new VCFHeader(headerLines, samples)); + + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + + int changedSites = 0; + for ( final VariantContext vc : VCs ) { + // split first into biallelics, and optionally trim alleles to minimal representation + Pair result = new Pair(vc,0); // default value + if (splitMultiallelics) { + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + for (final VariantContext biallelicVC: vcList) { + final VariantContext v = (trimAlleles ? 
GATKVariantContextUtils.trimAlleles(biallelicVC,true,true) : biallelicVC); + result = alignAndWrite(v, ref); + + // strip out PLs and AD if we've subsetted the alleles + if ( vcList.size() > 1 ) + result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make(); + + writer.add(result.first); + changedSites += result.second; + } + } + else { + if (trimAlleles) + result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); + else + result = alignAndWrite(vc,ref); + writer.add(result.first); + changedSites += result.second; + + } + + } + + return changedSites; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + writer.close(); + System.out.println(result + " variants were aligned"); + } + + /** + * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be + * handled by calling routine. + * @param vc Input VC with variants to left align + * @param ref Reference context + * @return # of records left-aligned (0 or 1) and new VC. 
+ */ + @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) + @Ensures({"result != null","result.first != null", "result.second >=0"}) + protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { + + final Pair retValue = new Pair(vc,0); + if (!vc.isIndel() || vc.isComplexIndel() ) { + return retValue; + } + + // get the indel length + final int indelLength; + if ( vc.isSimpleDeletion() ) + indelLength = vc.getReference().length() - 1; + else + indelLength = vc.getAlternateAllele(0).length() - 1; + + if ( indelLength > MAX_INDEL_LENGTH ) + return retValue; + + if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) + return retValue; + + final byte[] refSeq = ref.getBases(); + + // create an indel haplotype. + // + final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; + if (originalIndex < 0 || originalIndex >= ref.getBases().length) + return retValue; + + final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); + + // create a CIGAR string to represent the event + ArrayList elements = new ArrayList(); + elements.add(new CigarElement(originalIndex, CigarOperator.M)); + elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? 
CigarOperator.D : CigarOperator.I)); + elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); + Cigar originalCigar = new Cigar(elements); + + // left align the CIGAR + Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0, true); + + // update if necessary and write + if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { + int difference = originalIndex - newCigar.getCigarElement(0).getLength(); + VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); + //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); + + final int indelIndex = originalIndex-difference; + final byte[] newBases = new byte[indelLength + 1]; + newBases[0] = refSeq[indelIndex-1]; + System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 1, indelLength); + final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); + newVC = updateAllele(newVC, newAllele); + // overwrite default return value with new left-aligned VC + retValue.first = newVC; + retValue.second = 1; + + } + return retValue; + } + + /** + * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference + * @param vc Input VC - will use only alt allele from it + * @param ref Ref bases + * @param indexOfRef Index in ref where to create indel + * @param indelLength Indel length + * @return + */ + @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) + @Ensures("result != null") + private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { + byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; + + // add the bases before the indel + System.arraycopy(ref, 0, hap, 0, indexOfRef); + int currentPos = indexOfRef; + + // take care of the indel + if ( vc.isSimpleDeletion() ) { + indexOfRef += indelLength; + } else { + System.arraycopy(vc.getAlternateAllele(0).getBases(), 1, hap, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel + System.arraycopy(ref, indexOfRef, hap, currentPos, ref.length - indexOfRef); + + return hap; + } + + public static VariantContext updateAllele(final VariantContext vc, final Allele newAllele) { + // create a mapping from original allele to new allele + HashMap alleleMap = new HashMap(vc.getAlleles().size()); + if ( newAllele.isReference() ) { + alleleMap.put(vc.getReference(), newAllele); + alleleMap.put(vc.getAlternateAllele(0), Allele.create(newAllele.getBases()[0], false)); + } else { + alleleMap.put(vc.getReference(), Allele.create(newAllele.getBases()[0], true)); + alleleMap.put(vc.getAlternateAllele(0), newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList(); + for ( Allele allele : genotype.getAlleles() ) { + Allele newA = alleleMap.get(allele); + if ( newA == null ) + newA = Allele.NO_CALL; + newAlleles.add(newA); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java diff --git a/public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/JNAUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/clibrary/JNAUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/JNAUtils.java diff --git a/public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/LibC.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/clibrary/LibC.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/clibrary/LibC.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java new file mode 100644 index 000000000..1dc5f8516 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java @@ -0,0 +1,290 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.tools; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import org.apache.log4j.BasicConfigurator; +import org.apache.log4j.Level; +import org.broad.tribble.AbstractFeatureReader; +import org.broad.tribble.FeatureReader; +import org.broad.tribble.index.IndexCreator; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.bcf2.BCF2Codec; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.Options; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; + +import java.io.*; +import java.util.*; + + +/** + * + * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples + * + *

+ * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. + * This tool concatenates the scattered output VCF files. It assumes that: + * - All the input VCFs (or BCFs) contain the same samples in the same order. + * - The variants in each input file are from non-overlapping (scattered) intervals. + * + * When the input files are already sorted based on the intervals start positions, use -assumeSorted. + * + * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. + * + *

+ * + *

Input

+ *

+ * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). + * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. + * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + *

+ * + *

Output

+ *

+ * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. + * <\p> + * + *

Important note

+ *

This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

+ * + *

Example

+ *
+ * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
+ *    -R ref.fasta \
+ *    -V input1.vcf \
+ *    -V input2.vcf \
+ *    -out output.vcf \
+ *    -assumeSorted
+ * 
+ * + * @author Ami Levy Moonshine + * @since Jan 2012 + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP ) +public class CatVariants extends CommandLineProgram { + // setup the logging system, used by some codecs + private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + + @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) + private File refFile = null; + + /** + * The VCF or BCF files to merge together + * + * CatVariants can take any number of -V arguments on the command line. Each -V argument + * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more + * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. + * + */ + @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) + private List variant = null; + + @Output(fullName = "outputFile", shortName = "out", doc = "output file name .vcf or .bcf", required = true) + private File outputFile = null; + + @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false) + private Boolean assumeSorted = false; + + @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) + private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + + @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) + private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; + + /* + * print usage information + */ + private static void printUsage() { + System.err.println("Usage: java -cp target/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants --reference 
--variant --outputFile [--assumeSorted]"); + System.err.println(" The input file(s) can be of type: VCF (must end in .vcf or .VCF) or"); + System.err.println(" BCF2 (must end in .bcf or .BCF)."); + System.err.println(" Output file must be of type vcf or bcf (must end in .vcf or .bcf)."); + System.err.println(" If the input files are already sorted, then indicate that with --assumeSorted to improve performance."); + } + + @Override + protected int execute() throws Exception { + //if(help){ + // printUsage(); + // return 1; + //} + + BasicConfigurator.configure(); + logger.setLevel(Level.INFO); + + final ReferenceSequenceFile ref; + try { + ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + } catch ( Exception e ) { + throw new UserException("Couldn't load provided reference sequence file " + refFile, e); + } + + Comparator> positionComparator = new PositionComparator(); + + + //PriorityQueue>> queue = + // new PriorityQueue>>(2000, comparator); + Queue> priorityQueue; + if(assumeSorted) + priorityQueue = new LinkedList>(); + else + priorityQueue = new PriorityQueue>(10000, positionComparator); + + Iterator files = variant.iterator(); + File file; + while (files.hasNext()) { + file = files.next(); + if (!(file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF") || file.getName().endsWith(".bcf") || file.getName().endsWith(".BCF"))){ + System.err.println("File " + file.getAbsolutePath() + " should be .vcf or .bcf"); + printUsage(); + return 1; + } + if (assumeSorted){ + priorityQueue.add(new Pair(0,file)); + } + else{ + if (!file.exists()) { + throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); + } + FeatureReader reader; + boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); + if(useVCF) + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + else + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new 
BCF2Codec(), false); + Iterator it = reader.iterator(); + if(!it.hasNext()){ + System.err.println(String.format("File %s is empty. This file will be ignored",file.getAbsolutePath())); + continue; + } + VariantContext vc = it.next(); + int firstPosition = vc.getStart(); + reader.close(); + //queue.add(new Pair>(firstPosition,reader)); + priorityQueue.add(new Pair(firstPosition,file)); + } + + } + + if (!(outputFile.getName().endsWith(".vcf") || outputFile.getName().endsWith(".VCF"))){ + throw new UserException(String.format("Output file %s should be .vcf", outputFile)); + } + + FileOutputStream outputStream = new FileOutputStream(outputFile); + EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile); + final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); + + boolean firstFile = true; + int count =0; + //while(!queue.isEmpty()){ + while(!priorityQueue.isEmpty() ){ + count++; + //FeatureReader reader = queue.remove().getSecond(); + file = priorityQueue.remove().getSecond(); + if (!file.exists()) { + throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); + } + FeatureReader reader; + boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); + if(useVCF) + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); + else + reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); + + if(count%10 ==0) + System.out.print(count); + else + System.out.print("."); + if (firstFile){ + VCFHeader header = (VCFHeader)reader.getHeader(); + outputWriter.writeHeader(header); + firstFile = false; + } + + Iterator it = reader.iterator(); + + while (it.hasNext()){ + VariantContext vc = it.next(); + outputWriter.add(vc); + } + + 
reader.close(); + + } + System.out.println(); + + outputStream.close(); + outputWriter.close(); + + return 0; + } + + + public static void main(String[] args){ + try { + CatVariants instance = new CatVariants(); + start(instance, args); + System.exit(CommandLineProgram.result); + } catch ( UserException e ) { + printUsage(); + exitSystemWithUserError(e); + } catch ( Exception e ) { + exitSystemWithError(e); + } + } + + private static class PositionComparator implements Comparator> { + + @Override + public int compare(Pair p1, Pair p2) { + int startPositionP1 = p1.getFirst(); + int startPositionP2 = p2.getFirst(); + if (startPositionP1 == startPositionP2) + return 0; + return startPositionP1 < startPositionP2 ? -1 : 1 ; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/ListAnnotations.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/ListAnnotations.java diff --git a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/AutoFormattingTime.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/AutoFormattingTime.java diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/BaseUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BitSetUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BitSetUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/ContigComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/ContigComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/ContigComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/ContigComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLoc.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLoc.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocParser.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocParser.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocSortedSet.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/GenomeLocSortedSet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HasGenomeLocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/HasGenomeLocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HasGenomeLocation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HeapSizeMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/HeapSizeMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/HeapSizeMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/IndelUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/IndelUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/IndelUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/LRUCache.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/LRUCache.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/LRUCache.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/LRUCache.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MannWhitneyU.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MannWhitneyU.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java new file mode 100644 index 000000000..e73797705 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MathUtils.java @@ -0,0 +1,1576 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.apache.commons.math.distribution.ExponentialDistributionImpl; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.math.BigDecimal; +import java.util.*; + +/** + * MathUtils is a static class (no instantiation allowed!) with some useful math methods. + * + * @author Kiran Garimella + */ +public class MathUtils { + + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } + + public static final double[] log10Cache; + public static final double[] log10FactorialCache; + private static final double[] jacobianLogTable; + private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; + private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; + private static final double MAX_JACOBIAN_TOLERANCE = 8.0; + private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; + private static final int MAXN = 70_000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + + /** + * The smallest log10 value we'll emit from normalizeFromLog10 and other functions + * where the real-space value is 0.0. 
+ */ + public static final double LOG10_P_OF_ZERO = -1000000.0; + public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); + public static final double LOG_ONE_HALF = -Math.log10(2.0); + public static final double LOG_ONE_THIRD = -Math.log10(3.0); + private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); + private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); + + static { + log10Cache = new double[LOG10_CACHE_SIZE]; + log10FactorialCache = new double[LOG10_CACHE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + + log10Cache[0] = Double.NEGATIVE_INFINITY; + log10FactorialCache[0] = 0.0; + for (int k = 1; k < LOG10_CACHE_SIZE; k++) { + log10Cache[k] = Math.log10(k); + log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; + } + + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); + + } + } + + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( final int min, final int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + + // A fast implementation of the Math.round() method. This method does not perform + // under/overflow checking, so this shouldn't be used in the general case (but is fine + // if one is already make those checks before calling in to the rounding). + public static int fastRound(final double d) { + return (d > 0.0) ? 
(int) (d + 0.5d) : (int) (d - 0.5d); + } + + public static double approximateLog10SumLog10(final double[] vals) { + return approximateLog10SumLog10(vals, vals.length); + } + + public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { + + final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); + double approxSum = vals[maxElementIndex]; + + for (int i = 0; i < endIndex; i++) { + if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) + continue; + + final double diff = approxSum - vals[i]; + if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { + // See notes from the 2-inout implementation below + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding + approxSum += MathUtils.jacobianLogTable[ind]; + } + } + + return approxSum; + } + + public static double approximateLog10SumLog10(final double a, final double b, final double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + + public static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if (small > big) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) + return big; + + final double diff = big - small; + if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 
10.0 + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding + return big + MathUtils.jacobianLogTable[ind]; + } + + public static double sum(final double[] values) { + double s = 0.0; + for (double v : values) + s += v; + return s; + } + + public static long sum(final int[] x) { + long total = 0; + for (int v : x) + total += v; + return total; + } + + public static int sum(final byte[] x) { + int total = 0; + for (byte v : x) + total += (int)v; + return total; + } + + public static double percentage(int x, int base) { + return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); + } + + public static double ratio(final int num, final int denom) { + if ( denom > 0 ) { + return ((double) num)/denom; + } else { + if ( num == 0 && denom == 0) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + public static double ratio(final long num, final long denom) { + if ( denom > 0L ) { + return ((double) num)/denom; + } else { + if ( num == 0L && denom == 0L ) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + /** + * Converts a real space array of numbers (typically probabilities) into a log10 array + * + * @param prRealSpace + * @return + */ + public static double[] toLog10(final double[] prRealSpace) { + double[] log10s = new double[prRealSpace.length]; + for (int i = 0; i < prRealSpace.length; i++) { + log10s[i] = Math.log10(prRealSpace[i]); + } + return log10s; + } + + public static double log10sumLog10(final double[] log10p, final int start) { + return log10sumLog10(log10p, start, log10p.length); + } + + public static double log10sumLog10(final double[] log10p,final int start,final int finish) { + double sum = 0.0; + + double maxValue = arrayMax(log10p, finish); + if(maxValue == 
Double.NEGATIVE_INFINITY) + return maxValue; + + for (int i = start; i < finish; i++) { + if ( Double.isNaN(log10p[i]) || log10p[i] == Double.POSITIVE_INFINITY ) { + throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); + } + sum += Math.pow(10.0, log10p[i] - maxValue); + } + + return Math.log10(sum) + maxValue; + } + + public static double sumLog10(final double[] log10values) { + return Math.pow(10.0, log10sumLog10(log10values)); + } + + public static double log10sumLog10(final double[] log10values) { + return log10sumLog10(log10values, 0); + } + + public static boolean wellFormedDouble(final double val) { + return !Double.isInfinite(val) && !Double.isNaN(val); + } + + public static double bound(final double value, final double minBoundary, final double maxBoundary) { + return Math.max(Math.min(value, maxBoundary), minBoundary); + } + + public static boolean isBounded(final double val, final double lower, final double upper) { + return val >= lower && val <= upper; + } + + public static boolean isPositive(final double val) { + return !isNegativeOrZero(val); + } + + public static boolean isPositiveOrZero(final double val) { + return isBounded(val, 0.0, Double.POSITIVE_INFINITY); + } + + public static boolean isNegativeOrZero(final double val) { + return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); + } + + public static boolean isNegative(final double val) { + return !isPositiveOrZero(val); + } + + /** + * Compares double values for equality (within 1e-6), or inequality. + * + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + */ + public static byte compareDoubles(final double a, final double b) { + return compareDoubles(a, b, 1e-6); + } + + /** + * Compares double values for equality (within epsilon), or inequality. 
+ * + * @param a the first double value + * @param b the second double value + * @param epsilon the precision within which two double values will be considered equal + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + */ + public static byte compareDoubles(final double a, final double b, final double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } + return 1; + } + + /** + * Calculate f(x) = Normal(x | mu = mean, sigma = sd) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + public static double normalDistribution(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); + return a * b; + } + + /** + * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + + public static double normalDistributionLog10(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! 
wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); + final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; + return a + b; + } + + /** + * Calculate f(x) = x^2 + * @param x the value to square + * @return x * x + */ + public static double square(final double x) { + return x * x; + } + + /** + * Calculates the log10 of the binomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k number of successes + * @return the log10 of the binomial coefficient + */ + public static double binomialCoefficient(final int n, final int k) { + return Math.pow(10, log10BinomialCoefficient(n, k)); + } + + /** + * @see #binomialCoefficient(int, int) with log10 applied to result + */ + public static double log10BinomialCoefficient(final int n, final int k) { + if ( n < 0 ) { + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + } + if ( k > n || k < 0 ) { + throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); + } + + return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); + } + + /** + * Computes a binomial probability. This is computed using the formula + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

+ * where n is the number of trials, k is the number of successes, and p is the probability of success + * + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + */ + public static double binomialProbability(final int n, final int k, final double p) { + return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); + } + + /** + * @see #binomialProbability(int, int, double) with log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k, final double log10p) { + if ( log10p > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 + */ + public static double binomialProbability(final int n, final int k) { + return Math.pow(10, log10BinomialProbability(n, k)); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k) { + return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); + } + + /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ + private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = + Collections.synchronizedMap(new LRUCache(10_000)); + + /** + * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will + * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a + * utility function. 
+ */ + static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { + if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { + return null; + } else { + long result = 0; + result += (short) one; + result <<= 16; + result += (short) two; + result <<= 16; + result += (short) three; + return result; + } + } + + /** + * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * Assumes that the probability of a successful hit is fair (i.e. 0.5). + * + * This pure function is memoized because of its expensive BigDecimal calculations. + * + * @param n number of attempts for the number of hits + * @param k_start start (inclusive) of the cumulant sum (over hits) + * @param k_end end (inclusive) of the cumulant sum (over hits) + * @return - returns the cumulative probability + */ + public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { + if ( k_end > n ) + throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); + + // Fetch cached value, if applicable. 
+ final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); + final Double memoizationCacheResult; + if (memoizationKey != null) { + memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); + } else { + memoizationCacheResult = null; + } + + final double result; + if (memoizationCacheResult != null) { + result = memoizationCacheResult; + } else { + double cumProb = 0.0; + double prevProb; + BigDecimal probCache = BigDecimal.ZERO; + + for (int hits = k_start; hits <= k_end; hits++) { + prevProb = cumProb; + final double probability = binomialProbability(n, hits); + cumProb += probability; + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision + probCache = probCache.add(new BigDecimal(prevProb)); + cumProb = 0.0; + hits--; // repeat loop + // prevProb changes at start of loop + } + } + + result = probCache.add(new BigDecimal(cumProb)).doubleValue(); + if (memoizationKey != null) { + BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); + } + } + return result; + } + + private static final double LOG1MEXP_THRESHOLD = Math.log(0.5); + + private static final double LN_10 = Math.log(10); + + /** + * Calculates {@code log(1-exp(a))} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + * + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log1mexp(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + + return (a < LOG1MEXP_THRESHOLD) ? Math.log1p(-Math.exp(a)) : Math.log(-Math.expm1(a)); + } + + /** + * Calculates {@code log10(1-10^a)} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10OneMinusPow10(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + final double b = a * LN_10; + return log1mexp(b) / LN_10; + } + + /** + * Calculates the log10 of the multinomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10MultinomialCoefficient(final int n, final int[] k) { + if ( n < 0 ) + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + double denominator = 0.0; + int sum = 0; + for (int x : k) { + if ( x < 0 ) + throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); + if ( x > n ) + throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); + denominator += log10Factorial(x); + sum += x; + } + if ( sum != n ) + throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); + return log10Factorial(n) - denominator; + } + + /** + * Computes the log10 of the multinomial distribution probability given a vector + * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
+ * + * @param n number of trials + * @param k array of number of successes for each possibility + * @param log10p array of log10 probabilities + * @return + */ + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { + if (log10p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); + double log10Prod = 0.0; + for (int i = 0; i < log10p.length; i++) { + if ( log10p[i] > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); + log10Prod += log10p[i] * k[i]; + } + return log10MultinomialCoefficient(n, k) + log10Prod; + } + + /** + * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations. + * In this implementation, the value of n is inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. + */ + public static double multinomialCoefficient(final int[] k) { + int n = 0; + for (int xi : k) { + n += xi; + } + + return Math.pow(10, log10MultinomialCoefficient(n, k)); + } + + /** + * Computes a multinomial probability efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations, and + * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is + * inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. + */ + public static double multinomialProbability(final int[] k, final double[] p) { + if (p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); + + int n = 0; + double[] log10P = new double[p.length]; + for (int i = 0; i < p.length; i++) { + log10P[i] = Math.log10(p[i]); + n += k[i]; + } + return Math.pow(10, log10MultinomialProbability(n, k, log10P)); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an byte[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final byte[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an int[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final int[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of doubles + * + * @param x a double[] of numbers + * @return the RMS of the specified numbers. 
+ */ + public static double rms(final Double[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (Double i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + public static double rms(final Collection l) { + if (l.size() == 0) + return 0.0; + + double rms = 0.0; + for (int i : l) + rms += i * i; + rms /= l.size(); + return Math.sqrt(rms); + } + + public static double distanceSquared(final double[] x, final double[] y) { + double dist = 0.0; + for (int iii = 0; iii < x.length; iii++) { + dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); + } + return dist; + } + + public static double round(final double num, final int digits) { + double result = num * Math.pow(10.0, (double) digits); + result = Math.round(result); + result = result / Math.pow(10.0, (double) digits); + return result; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @param takeLog10OfOutput if true, the output will be transformed back into log10 units + * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { + return normalizeFromLog10(array, takeLog10OfOutput, false); + } + + /** + * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space + * + * @param array + * @param takeLog10OfOutput + * @param keepInLogSpace + * + * @return + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. 
+ double maxValue = arrayMax(array); + + // we may decide to just normalize in log space without converting to linear space + if (keepInLogSpace) { + for (int i = 0; i < array.length; i++) { + array[i] -= maxValue; + } + return array; + } + + // default case: go to linear space + double[] normalized = new double[array.length]; + + for (int i = 0; i < array.length; i++) + normalized[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += normalized[i]; + for (int i = 0; i < array.length; i++) { + double x = normalized[i] / sum; + if (takeLog10OfOutput) { + x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } + + normalized[i] = x; + } + + return normalized; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + public static double[] normalizeFromLog10(final double[] array) { + return normalizeFromLog10(array, false); + } + + /** + * normalizes the real-space probability array. + * + * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok + * to have values in the array of > 1, or have the sum go above 0. 
+ * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + @Requires("array != null") + @Ensures({"result != null"}) + public static double[] normalizeFromRealSpace(final double[] array) { + if ( array.length == 0 ) + return array; + + final double sum = sum(array); + final double[] normalized = new double[array.length]; + if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); + for ( int i = 0; i < array.length; i++ ) { + normalized[i] = array[i] / sum; + } + return normalized; + } + + public static int maxElementIndex(final double[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final double[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int maxElementIndex(final int[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final int[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int maxElementIndex(final byte[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int arrayMax(final int[] array) { + return array[maxElementIndex(array)]; + } + + + public 
static double arrayMax(final double[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + + public static double arrayMin(final double[] array) { + return array[minElementIndex(array)]; + } + + public static int arrayMin(final int[] array) { + return array[minElementIndex(array)]; + } + + public static byte arrayMin(final byte[] array) { + return array[minElementIndex(array)]; + } + + /** + * Compute the min element of a List + * @param array a non-empty list of integer + * @return the min + */ + public static int arrayMin(final List array) { + if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); + int min = array.get(0); + for ( final int i : array ) + if ( i < min ) min = i; + return min; + } + + /** + * Compute the median element of the list of integers + * @param array a list of integers + * @return the median element + */ + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list witch is O(n log n). 
However find out the ith element (thus calculate the median) can be done in O(n) + */ + if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); + final int size = array.size(); + if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); + else if ( size == 1 ) return array.get(0); + else { + final ArrayList sorted = new ArrayList<>(array); + Collections.sort(sorted); + return sorted.get(size / 2); + } + } + + public static int minElementIndex(final double[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final byte[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final int[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int arrayMaxInt(final List array) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) + throw new IllegalArgumentException("Array size cannot be 0!"); + + int m = array.get(0); + for (int e : array) + m = Math.max(m, e); + return m; + } + + public static int sum(final List list ) { + int sum = 0; + for ( Integer i : list ) { + sum += i; + } + return sum; + } + + public static double average(final List vals, final int maxI) { + long sum = 0L; + + int i = 0; + for (long x : vals) { + if (i > maxI) + break; + sum += x; + i++; + } + + return (1.0 * sum) / i; + } + + public static 
double average(final List vals) { + return average(vals, vals.size()); + } + + public static int countOccurrences(final char c, final String s) { + int count = 0; + for (int i = 0; i < s.length(); i++) { + count += s.charAt(i) == c ? 1 : 0; + } + return count; + } + + public static int countOccurrences(T x, List l) { + int count = 0; + for (T y : l) { + if (x.equals(y)) + count++; + } + + return count; + } + + public static int countOccurrences(byte element, byte[] array) { + int count = 0; + for (byte y : array) { + if (element == y) + count++; + } + + return count; + } + + public static int countOccurrences(final boolean element, final boolean[] array) { + int count = 0; + for (final boolean b : array) { + if (element == b) + count++; + } + + return count; + } + + + /** + * Returns n random indices drawn with replacement from the range 0..(k-1) + * + * @param n the total number of indices sampled from + * @param k the number of random indices to draw (with replacement) + * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates + */ + static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { + + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { + //Integer chosen_ball = balls[rand.nextInt(k)]; + chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); + //balls.remove(chosen_ball); + } + + return chosen_balls; + } + + /** + * Returns n random indices drawn without replacement from the range 0..(k-1) + * + * @param n the total number of indices sampled from + * @param k the number of random indices to draw (without replacement) + * @return a list of k random indices ranging from 0 to (n-1) without duplicates + */ + static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { + ArrayList chosen_balls = new ArrayList(k); + + for (int i = 0; i < n; i++) { + chosen_balls.add(i); + } + + Collections.shuffle(chosen_balls, 
GenomeAnalysisEngine.getRandomGenerator()); + + //return (ArrayList) chosen_balls.subList(0, k); + return new ArrayList(chosen_balls.subList(0, k)); + } + + /** + * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices + */ + static public ArrayList sliceListByIndices(final List indices, final List list) { + ArrayList subset = new ArrayList(); + + for (int i : indices) { + subset.add(list.get(i)); + } + + return subset; + } + + /** + * Given two log-probability vectors, compute log of vector product of them: + * in Matlab notation, return log10(10.*x'*10.^y) + * @param x vector 1 + * @param y vector 2 + * @return a double representing log (dotProd(10.^x,10.^y) + */ + public static double logDotProduct(final double [] x, final double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Vectors of different lengths"); + + double tmpVec[] = new double[x.length]; + + for (int k=0; k < tmpVec.length; k++ ) { + tmpVec[k] = x[k]+y[k]; + } + + return log10sumLog10(tmpVec); + + + + } + + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! 
goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value. By default allows + * -Infinity values, as log10(0.0) == -Infinity. + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return goodLog10Probability(result, true); + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @param allowNegativeInfinity should we consider a -Infinity value ok? + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { + return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); + } + + /** + * Checks that the result is a well-formed probability + * + * @param result a supposedly well-formed probability value + * @return true if result is really well formed + */ + public static boolean goodProbability(final double result) { + return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. + * The number of observations does not have to be known in advance, and can be also very big (so that + * it could overflow any naive summation-based scheme or cause loss of precision). + * Instead, adding a new number observed + * to a sample with add(observed) immediately updates the instance of this object so that + * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 + * (see also e.g. 
http://www.johndcook.com/standard_deviation.html for online reference). + */ + public static class RunningAverage { + private double mean = 0.0; + private double s = 0.0; + private long obs_count = 0; + + public void add(double obs) { + obs_count++; + double oldMean = mean; + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); + } + + public void addAll(Collection col) { + for (Number o : col) { + add(o.doubleValue()); + } + } + + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } + + public RunningAverage clone() { + RunningAverage ra = new RunningAverage(); + ra.mean = this.mean; + ra.s = this.s; + ra.obs_count = this.obs_count; + return ra; + } + + public void merge(RunningAverage other) { + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); + this.s += other.s; + } + this.obs_count += other.obs_count; + } + } + + // + // useful common utility routines + // + + static public double max(double x0, double x1, double x2) { + double a = Math.max(x0, x1); + return Math.max(a, x2); + } + + /** + * Converts LN to LOG10 + * + * @param ln log(x) + * @return log10(x) + */ + public static double lnToLog10(final double ln) { + return ln * Math.log10(Math.E); + } + + /** + * Constants to simplify the log gamma function calculation. 
+ */ + private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long with 32 bit shift + */ + private static final int HI(final double x) { + return (int) (Double.doubleToLongBits(x) >> 32); + } + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long without shift + */ + private static final int LO(final double x) { + return (int) Double.doubleToLongBits(x); + } + + /** + * Most efficent implementation of the lnGamma (FDLIBM) + * Use via the log10Gamma wrapper method. + */ + private static double lnGamma(final double x) { + double t, y, z, p, p1, p2, p3, q, r, w; + int i; + + int hx = HI(x); + int lx = LO(x); + + /* purge off +-inf, NaN, +-0, and negative arguments */ + int ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) + return Double.POSITIVE_INFINITY; + if ((ix | lx) == 0 || hx < 0) + return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + return -Math.log(x); + } + + /* purge off 1 and 2 */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -Math.log(x); + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } + else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } + else { + y = x; + i = 2; + } + } + else { + r = zero; + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else { + y = x - one; + i = 2; + } + } + + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); + } + } + else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; + t = zero; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; + } + /* 8.0 <= x < 2**58 */ + } + else if (ix < 0x43900000) { + t = Math.log(x); + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } + else + /* 2**58 <= x <= inf */ + r = x * (Math.log(x) - one); + return r; + } + + /** + * Calculates the log10 of the gamma function for x using the efficient FDLIBM + * implementation to avoid overflows and guarantees high accuracy even for large + * numbers. + * + * @param x the x parameter + * @return the log10 of the gamma function at x. 
+ */ + public static double log10Gamma(final double x) { + return lnToLog10(lnGamma(x)); + } + + public static double factorial(final int x) { + // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value + return (double)Math.round(Math.pow(10, log10Factorial(x))); + } + + public static double log10Factorial(final int x) { + if (x >= log10FactorialCache.length || x < 0) + return log10Gamma(x + 1); + else + return log10FactorialCache[x]; + } + + /** + * Adds two arrays together and returns a new array with the sum. + * + * @param a one array + * @param b another array + * @return a new array with the sum of a and b + */ + @Requires("a.length == b.length") + @Ensures("result.length == a.length") + public static int[] addArrays(final int[] a, final int[] b) { + int[] c = new int[a.length]; + for (int i = 0; i < a.length; i++) + c[i] = a[i] + b[i]; + return c; + } + + /** Same routine, unboxed types for efficiency + * + * @param x First vector + * @param y Second vector + * @return Vector of same length as x and y so that z[k] = x[k]+y[k] + */ + public static double[] vectorSum(final double[]x, final double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); + + double[] result = new double[x.length]; + for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { + final LinkedList values = new LinkedList<>(); + final double log10range = Math.log10(stop - start); + + if ( start == 0 ) + values.add(0); + + double i = 0.0; + while ( i <= log10range ) { + final int index = (int)Math.round(Math.pow(10, i)) + start; + if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) + values.add(index); + i += eps; + } + + if ( values.peekLast() == null || values.peekLast() != stop ) + values.add(stop); + + return values; + } + + /** + * Compute in a numerical correct way the quantity log10(1-x) + 
* + * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow + * in 1-x when x is very small + * + * @param x a positive double value between 0.0 and 1.0 + * @return an estimate of log10(1-x) + */ + @Requires("x >= 0.0 && x <= 1.0") + @Ensures("result <= 0.0") + public static double log10OneMinusX(final double x) { + if ( x == 1.0 ) + return Double.NEGATIVE_INFINITY; + else if ( x == 0.0 ) + return 0.0; + else { + final double d = Math.log10(1 / x - 1) + Math.log10(x); + return Double.isInfinite(d) || d > 0.0 ? 0.0 : d; + } + } + + /** + * Draw N random elements from list + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSubset(final List list, final int N) { + if (list.size() <= N) { + return list; + } + + return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); + } + + /** + * Draw N random elements from list with replacement + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSample(final List list, final int N) { + return sliceListByIndices(sampleIndicesWithReplacement(list.size(),N),list); + } + + /** + * Return the likelihood of observing the counts of categories having sampled a population + * whose categorial frequencies are distributed according to a Dirichlet distribution + * @param dirichletParams - params of the prior dirichlet distribution + * @param dirichletSum - the sum of those parameters + * @param counts - the counts of observation in each category + * @param countSum - the sum of counts (number of trials) + * @return - associated likelihood + */ + public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, + final int[] counts, final int countSum) { + if ( dirichletParams.length != counts.length ) { + throw new IllegalStateException("The number of dirichlet parameters must match the 
number of categories"); + } + // todo -- lots of lnGammas here. At some point we can safely switch to x * ( ln(x) - 1) + double likelihood = log10MultinomialCoefficient(countSum,counts); + likelihood += log10Gamma(dirichletSum); + likelihood -= log10Gamma(dirichletSum+countSum); + for ( int idx = 0; idx < counts.length; idx++ ) { + likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); + likelihood -= log10Gamma(dirichletParams[idx]); + } + + return likelihood; + } + + public static double dirichletMultinomial(double[] params, int[] counts) { + return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); + } + + public static ExponentialDistribution exponentialDistribution( final double mean ) { + return new ExponentialDistributionImpl(mean); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Median.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/Median.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Median.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MendelianViolation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MendelianViolation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/NGSPlatform.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/NGSPlatform.java diff --git a/public/java/src/org/broadinstitute/sting/utils/PathUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/PathUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/PathUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/PathUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java new file mode 100644 index 000000000..543923dd6 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/QualityUtils.java @@ -0,0 +1,397 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import net.sf.samtools.SAMUtils; + +/** + * QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating + * quality scores. + * + * @author Kiran Garimella, Mark DePristo + * @since Way back + */ +public class QualityUtils { + /** + * Maximum quality score that can be encoded in a SAM/BAM file + */ + public final static byte MAX_SAM_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; + + + private final static double RAW_MIN_PHRED_SCALED_QUAL = Math.log10(Double.MIN_VALUE); + protected final static double MIN_PHRED_SCALED_QUAL = -10.0 * RAW_MIN_PHRED_SCALED_QUAL; + + /** + * bams containing quals above this value are extremely suspicious and we should warn the user + */ + public final static byte MAX_REASONABLE_Q_SCORE = 60; + + /** + * The lowest quality score for a base that is considered reasonable for statistical analysis. This is + * because Q 6 => you stand a 25% of being right, which means all bases are equally likely + */ + public final static byte MIN_USABLE_Q_SCORE = 6; + public final static int MAPPING_QUALITY_UNAVAILABLE = 255; + + /** + * Maximum sense quality value. + */ + public static final int MAX_QUAL = 254; + + /** + * Cached values for qual as byte calculations so they are very fast + */ + private static double qualToErrorProbCache[] = new double[MAX_QUAL + 1]; + private static double qualToProbLog10Cache[] = new double[MAX_QUAL + 1]; + + + static { + for (int i = 0; i <= MAX_QUAL; i++) { + qualToErrorProbCache[i] = qualToErrorProb((double) i); + qualToProbLog10Cache[i] = Math.log10(1.0 - qualToErrorProbCache[i]); + } + } + + /** + * Private constructor. 
No instantiating this class! + */ + private QualityUtils() {} + + // ---------------------------------------------------------------------- + // + // These are all functions to convert a phred-scaled quality score to a probability + // + // ---------------------------------------------------------------------- + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a discretized byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a quality score (0-255) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToProb(final byte qual) { + return 1.0 - qualToErrorProb(qual); + } + + /** + * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToProb(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return 1.0 - qualToErrorProb(qual); + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being true (Q30 => log10(0.999)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. 
+ * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToProbLog10(final byte qual) { + return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + /** + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a double value, this function must call Math.pow so can be quite expensive + * + * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToErrorProb(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return Math.pow(10.0, qual / -10.0); + } + + /** + * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * Because the input is a byte value, this function uses a cache so is very efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. 
The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + @Ensures("result >= 0.0 && result <= 1.0") + public static double qualToErrorProb(final byte qual) { + return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual a phred-scaled quality score encoded as a byte + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToErrorProbLog10(final byte qual) { + return qualToErrorProbLog10((double)(qual & 0xFF)); + } + + /** + * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) + * + * This is the Phred-style conversion, *not* the Illumina-style conversion. + * + * The calculation is extremely efficient + * + * @param qual a phred-scaled quality score encoded as a double + * @return a probability (0.0-1.0) + */ + @Ensures("result <= 0.0") + public static double qualToErrorProbLog10(final double qual) { + if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); + return qual / -10.0; + } + + // ---------------------------------------------------------------------- + // + // Functions to convert a probability to a phred-scaled quality score + // + // ---------------------------------------------------------------------- + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). 
+ * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + public static byte errorProbToQual(final double errorRate) { + return errorProbToQual(errorRate, MAX_SAM_QUAL_SCORE); + } + + /** + * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). + * + * Note, this function caps the resulting quality score by the public static value MIN_REASONABLE_ERROR + * and by 1 at the low-end. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) + * @return a quality score (0-maxQual) + */ + public static byte errorProbToQual(final double errorRate, final byte maxQual) { + if ( ! MathUtils.goodProbability(errorRate) ) throw new IllegalArgumentException("errorRate must be good probability but got " + errorRate); + final double d = Math.round(-10.0*Math.log10(errorRate)); + return boundQual((int)d, maxQual); + } + + /** + * @see #errorProbToQual(double, byte) with proper conversion of maxQual integer to a byte + */ + public static byte errorProbToQual(final double prob, final int maxQual) { + if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); + return errorProbToQual(prob, (byte)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE + * and by 1 at the low-end. 
+ * + * @param prob a probability (0.0-1.0) of being right + * @return a quality score (0-MAX_SAM_QUAL_SCORE) + */ + public static byte trueProbToQual(final double prob) { + return trueProbToQual(prob, MAX_SAM_QUAL_SCORE); + } + + /** + * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). + * + * Note, this function caps the resulting quality score by the min probability allowed (EPS). + * So for example, if prob is 1e-6, which would imply a Q-score of 60, and EPS is 1e-4, + * the result of this function is actually Q40. + * + * Note that the resulting quality score, regardless of EPS, is capped by MAX_SAM_QUAL_SCORE and + * bounded on the low-side by 1. + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param trueProb a probability (0.0-1.0) of being right + * @param maxQual the maximum quality score we are allowed to emit here, regardless of the error rate + * @return a phred-scaled quality score (0-maxQualScore) as a byte + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") + public static byte trueProbToQual(final double trueProb, final byte maxQual) { + if ( ! 
MathUtils.goodProbability(trueProb) ) throw new IllegalArgumentException("trueProb must be good probability but got " + trueProb); + final double lp = Math.round(-10.0*MathUtils.log10OneMinusX(trueProb)); + return boundQual((int)lp, maxQual); + } + + /** + * @see #trueProbToQual(double, byte) with proper conversion of maxQual to a byte + */ + public static byte trueProbToQual(final double prob, final int maxQual) { + if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); + return trueProbToQual(prob, (byte)(maxQual & 0xFF)); + } + + /** + * Convert a probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRate the probability of being right (0.0-1.0) + * @return a phred-scaled version of the error rate implied by trueRate + */ + @Ensures("result >= 0.0") + public static double phredScaleCorrectRate(final double trueRate) { + return phredScaleLog10ErrorRate(MathUtils.log10OneMinusX(trueRate)); + } + + /** + * Convert a log10 probability of being right to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param trueRateLog10 the log10 probability of being right (0.0-1.0). 
Can be -Infinity to indicate + * that the result is impossible in which MIN_PHRED_SCALED_QUAL is returned + * @return a phred-scaled version of the error rate implied by trueRate + */ + @Ensures("result >= 0.0") + public static double phredScaleLog10CorrectRate(final double trueRateLog10) { + return phredScaleCorrectRate(Math.pow(10.0, trueRateLog10)); + } + + /** + * Convert a probability of being wrong to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRate the probability of being wrong (0.0-1.0) + * @return a phred-scaled version of the error rate + */ + @Ensures("result >= 0.0") + public static double phredScaleErrorRate(final double errorRate) { + return phredScaleLog10ErrorRate(Math.log10(errorRate)); + } + + /** + * Convert a log10 probability of being wrong to a phred-scaled quality score of being wrong as a double + * + * This is a very generic method, that simply computes a phred-scaled double quality + * score given an error rate. It has the same precision as a normal double operation + * + * @param errorRateLog10 the log10 probability of being wrong (0.0-1.0). Can be -Infinity, in which case + * the result is MIN_PHRED_SCALED_QUAL + * @return a phred-scaled version of the error rate + */ + @Ensures("result >= 0.0") + public static double phredScaleLog10ErrorRate(final double errorRateLog10) { + if ( ! 
MathUtils.goodLog10Probability(errorRateLog10) ) throw new IllegalArgumentException("errorRateLog10 must be good probability but got " + errorRateLog10); + // abs is necessary for edge base with errorRateLog10 = 0 producing -0.0 doubles + return Math.abs(-10.0 * Math.max(errorRateLog10, RAW_MIN_PHRED_SCALED_QUAL)); + } + + // ---------------------------------------------------------------------- + // + // Routines to bound a quality score to a reasonable range + // + // ---------------------------------------------------------------------- + + /** + * Return a quality score that bounds qual by MAX_SAM_QUAL_SCORE and 1 + * + * @param qual the uncapped quality score as an integer + * @return the bounded quality score + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (MAX_SAM_QUAL_SCORE & 0xFF)") + public static byte boundQual(int qual) { + return boundQual(qual, MAX_SAM_QUAL_SCORE); + } + + /** + * Return a quality score that bounds qual by maxQual and 1 + * + * WARNING -- because this function takes a byte for maxQual, you must be careful in converting + * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) + * + * @param qual the uncapped quality score as an integer. 
Can be < 0 (which may indicate an error in the + * client code), which will be brought back to 1, but this isn't an error, as some + * routines may use this functionality (BaseRecalibrator, for example) + * @param maxQual the maximum quality score, must be less < 255 + * @return the bounded quality score + */ + @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") + public static byte boundQual(final int qual, final byte maxQual) { + return (byte) (Math.max(Math.min(qual, maxQual & 0xFF), 1) & 0xFF); + } + + } + + diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutorException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptExecutorException.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptLibrary.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RScriptLibrary.java diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/R/RUtils.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/R/RUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/SampleUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SampleUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java new file mode 100644 index 000000000..59516f196 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/SimpleTimer.java @@ -0,0 +1,261 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import org.apache.log4j.Logger; + +import java.text.NumberFormat; +import java.util.concurrent.TimeUnit; +import static java.lang.Math.abs; + +/** + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. + * + * This timer has been modified to provide better semantics for dealing with + * system-level checkpoint and restarting. Such events can cause the internal JVM + * clock to be reset, breaking timings based upon it. Whilst this is difficult to + * counter without getting explicit notice of checkpoint events, we try to moderate + * the symptoms through tracking the offset between the system clock and the JVM clock. + * If this offset grows drastically (greater than CLOCK_DRIFT), we infer a JVM restart + * and reset the timer. 
+ * + * User: depristo + * Date: Dec 10, 2010 + * Time: 9:07:44 AM + */ +public class SimpleTimer { + private final static Logger logger = Logger.getLogger(SimpleTimer.class); + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private static final long MILLI_TO_NANO= TimeUnit.MILLISECONDS.toNanos(1); + private static final ThreadLocal NUMBER_FORMAT = new ThreadLocal() { + @Override + protected NumberFormat initialValue() { + return NumberFormat.getIntegerInstance(); + } + }; + + /** + * Allowable clock drift in nanoseconds. + */ + private static final long CLOCK_DRIFT = TimeUnit.SECONDS.toNanos(5); + private final String name; + + /** + * The difference between system time and JVM time at last sync. + * This is used to detect JVM checkpoint/restart events, and should be + * reset when a JVM checkpoint/restart is detected. + */ + private long nanoTimeOffset; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the + * sum of times between starts/restrats and stops. + */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; + + /** + * Creates an anonymous simple timer + */ + public SimpleTimer() { + this("Anonymous"); + } + + /** + * Creates a simple timer named name + * @param name of the timer, must not be null + */ + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); + this.name = name; + + this.nanoTimeOffset = getNanoOffset(); + } + + /** + * @return the name associated with this timer + */ + public synchronized String getName() { + return name; + } + + /** + * Starts the timer running, and sets the elapsedTimeNano time to 0. 
This is equivalent to + * resetting the time to have no history at all. + * + * @return this object, for programming convenience + */ + @Ensures("elapsedTimeNano == 0l") + public synchronized SimpleTimer start() { + elapsedTimeNano = 0l; + return restart(); + } + + /** + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be + * called without first calling start(). The only difference between start and restart + * is that start resets the elapsedTimeNano time, while restart does not. + * + * @return this object, for programming convenience + */ + public synchronized SimpleTimer restart() { + running = true; + startTimeNano = currentTimeNano(); + nanoTimeOffset = getNanoOffset(); + return this; + } + + /** + * @return is this timer running? + */ + public synchronized boolean isRunning() { + return running; + } + + /** + * @return A convenience function to obtain the current time in milliseconds from this timer + */ + public long currentTime() { + return System.currentTimeMillis(); + } + + /** + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * This method calls `ensureClockSync` to make sure that the JVM and system clocks + * are roughly in sync since the start of the timer. If they are not, then the time + * elapsed since the previous 'stop' will not be added to the timer. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. 
+ * + * @return this object, for programming convenience + */ + @Requires("startTimeNano != 0l") + public synchronized SimpleTimer stop() { + if ( running ) { + running = false; + if (ensureClockSync()) { + elapsedTimeNano += currentTimeNano() - startTimeNano; + } + } + return this; + } + + /** + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently + * running, includes the difference from currentTime() and the start as well + * + * @return this time, in seconds + */ + public synchronized double getElapsedTime() { + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + if (running && ensureClockSync()) { + return currentTimeNano() - startTimeNano + elapsedTimeNano; + } else { + return elapsedTimeNano; + } + } + + /** + * Add the elapsed time from toAdd to this elapsed time + * + * @param toAdd the timer whose elapsed time we want to add to this timer + */ + public synchronized void addElapsed(final SimpleTimer toAdd) { + elapsedTimeNano += toAdd.getElapsedTimeNano(); + } + + /** + * Get the current offset of nano time from system time. + */ + private static long getNanoOffset() { + return System.nanoTime() - (System.currentTimeMillis() * MILLI_TO_NANO); + } + + /** + * Ensure that the JVM time has remained in sync with system time. + * This will also reset the clocks to avoid gradual drift. 
+ * + * @return true if the clocks are in sync, false otherwise + */ + private boolean ensureClockSync() { + final long currentOffset = getNanoOffset(); + final long diff = abs(currentOffset - nanoTimeOffset); + final boolean ret = (diff <= CLOCK_DRIFT); + if (!ret) { + final NumberFormat numberFormat = NUMBER_FORMAT.get(); + final String msg = String.format( + "Clock drift of %s - %s = %s nanoseconds detected, vs. max allowable drift of %s. " + + "Assuming checkpoint/restart event.", + numberFormat.format(currentOffset), + numberFormat.format(nanoTimeOffset), + numberFormat.format(diff), + numberFormat.format(CLOCK_DRIFT)); + // Log message + logger.warn(msg); + } + // Reset the drift meter to stay in sync. + this.nanoTimeOffset = currentOffset; + return ret; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/UnvalidatingGenomeLoc.java diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/Utils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/Utils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java new file mode 100644 index 000000000..0c819b4fb --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -0,0 +1,500 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, 
to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.activeregion; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Represents a single active region created by the Active Region Traversal for processing + * + * An active region is a single contiguous span of bases on the genome that should be operated + * on as a single unit for the active region traversal. 
The action may contains a list of + * reads that overlap the region (may because there may be no reads in the region). The region + * is tagged as being either active or inactive, depending on the probabilities provided by + * the isActiveProb results from the ART walker. Each region carries with it the + * exact span of the region (bases which are the core of the isActiveProbs from the walker) as + * well as an extended size, that includes the ART walker's extension size. Reads in the region + * provided by ART include all reads overlapping the extended span, not the raw span. + * + * User: rpoplin + * Date: 1/4/12 + */ +@Invariant({ + "extension >= 0", + "activeRegionLoc != null", + "genomeLocParser != null", + "spanIncludingReads != null", + "extendedLoc != null" +}) +public class ActiveRegion implements HasGenomeLocation { + /** + * The reads included in this active region. May be empty upon creation, and expand / contract + * as reads are added or removed from this region. + */ + private final List reads = new ArrayList(); + + /** + * An ordered list (by genomic coordinate) of the ActivityProfileStates that went + * into this active region. May be empty, which says that no supporting states were + * provided when this region was created. + */ + private final List supportingStates; + + /** + * The raw span of this active region, not including the active region extension + */ + private final GenomeLoc activeRegionLoc; + + /** + * The span of this active region on the genome, including the active region extension + */ + private final GenomeLoc extendedLoc; + + /** + * The extension, in bp, of this active region. + */ + private final int extension; + + /** + * A genomeLocParser so we can create genomeLocs + */ + private final GenomeLocParser genomeLocParser; + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? 
+ */ + private final boolean isActive; + + /** + * The span of this active region, including the bp covered by all reads in this + * region. This union of extensionLoc and the loc of all reads in this region. + * + * Must be at least as large as extendedLoc, but may be larger when reads + * partially overlap this region. + */ + private GenomeLoc spanIncludingReads; + + + /** + * Indicates whether the active region has been finalized + */ + private boolean hasBeenFinalized; + + /** + * Create a new ActiveRegion containing no reads + * + * @param activeRegionLoc the span of this active region + * @param supportingStates the states that went into creating this region, or null / empty if none are available. + * If not empty, must have exactly one state for each bp in activeRegionLoc + * @param isActive indicates whether this is an active region, or an inactve one + * @param genomeLocParser a non-null parser to let us create new genome locs + * @param extension the active region extension to use for this active region + */ + public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); + if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); + + this.activeRegionLoc = activeRegionLoc; + this.supportingStates = supportingStates == null ? 
Collections.emptyList() : new ArrayList(supportingStates); + this.isActive = isActive; + this.genomeLocParser = genomeLocParser; + this.extension = extension; + this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + this.spanIncludingReads = extendedLoc; + + if ( ! this.supportingStates.isEmpty() ) { + if ( this.supportingStates.size() != activeRegionLoc.size() ) + throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); + GenomeLoc lastStateLoc = null; + for ( final ActivityProfileState state : this.supportingStates ) { + if ( lastStateLoc != null ) { + if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) + throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); + } + lastStateLoc = state.getLoc(); + } + } + } + + /** + * Simple interface to create an active region that isActive without any profile state + */ + public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) { + this(activeRegionLoc, Collections.emptyList(), true, genomeLocParser, extension); + } + + @Override + public String toString() { + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); + } + + /** + * See #getActiveRegionReference but with padding == 0 + */ + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + return getActiveRegionReference(referenceReader, 0); + } + + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional 
padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, extendedLoc); + } + + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + return getFullReference(referenceReader, 0); + } + + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, spanIncludingReads); + } + + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. 
If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); + if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); + if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); + if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); + + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), + Math.max(1, genomeLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + + return reference; + } + + /** + * Get the raw span of this active region (excluding the extension) + * @return a non-null genome loc + */ + @Override + @Ensures("result != null") + public GenomeLoc getLocation() { return activeRegionLoc; } + + /** + * Get the span of this active region including the extension value + * @return a non-null GenomeLoc + */ + @Ensures("result != null") + public GenomeLoc getExtendedLoc() { return extendedLoc; } + + /** + * Get the span of this active region including the extension and the projects on the + * genome of all reads in this active 
region. That is, returns the bp covered by this + * region and all reads in the region. + * @return a non-null genome loc + */ + @Ensures("result != null") + public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } + + /** + * Get the active profile states that went into creating this region, if possible + * @return an unmodifiable list of states that led to the creation of this region, or an empty + * list if none were provided + */ + @Ensures("result != null") + public List getSupportingStates() { + return Collections.unmodifiableList(supportingStates); + } + + /** + * Get the active region extension applied to this region + * + * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions + * + * @return the size in bp of the region extension + */ + @Ensures("result >= 0") + public int getExtension() { return extension; } + + /** + * Get an unmodifiable list of reads currently in this active region. + * + * The reads are sorted by their coordinate position + * + * @return an unmodifiable list of reads in this active region + */ + @Ensures("result != null") + public List getReads() { + return Collections.unmodifiableList(reads); + } + + /** + * Get the number of reads currently in this active region + * @return an integer >= 0 + */ + @Ensures("result >= 0") + public int size() { return reads.size(); } + + /** + * Add read to this active region + * + * Read must have alignment start >= than the last read currently in this active region. + * + * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region + * + * @param read a non-null GATKSAMRecord + */ + @Ensures("reads.size() == old(reads.size()) + 1") + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + if ( ! 
readOverlapsRegion(read) ) + throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); + + spanIncludingReads = spanIncludingReads.union( readLoc ); + + if ( ! reads.isEmpty() ) { + final GATKSAMRecord lastRead = reads.get(size() - 1); + if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); + + if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); + } + + reads.add( read ); + } + + /** + * Returns true if read would overlap the extended extent of this region + * @param read the read we want to test + * @return true if read can be added to this region, false otherwise + */ + public boolean readOverlapsRegion(final GATKSAMRecord read) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + return readLoc.overlapsP(extendedLoc); + } + + /** + * Add all reads to this active region + * @param reads a collection of reads to add to this active region + */ + public void addAll(final Collection reads) { + if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); + for ( final GATKSAMRecord read : reads ) + add(read); + } + + /** + * Clear all of the reads currently in this active region + */ + @Ensures("size() == 0") + public void clearReads() { + spanIncludingReads = extendedLoc; + reads.clear(); + } + + /** + * Remove all of the reads in readsToRemove from this active region + * @param readsToRemove the set of reads we want to remove + */ + public void removeAll( final Set readsToRemove ) { + final Iterator it = 
reads.iterator(); + spanIncludingReads = extendedLoc; + while ( it.hasNext() ) { + final GATKSAMRecord read = it.next(); + if ( readsToRemove.contains(read) ) + it.remove(); + else + spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); + } + } + + /** + * Is this region equal to other, excluding any reads in either region in the comparison + * @param other the other active region we want to test + * @return true if this region is equal, excluding any reads and derived values, to other + */ + protected boolean equalExceptReads(final ActiveRegion other) { + if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; + if ( isActive() != other.isActive()) return false; + if ( genomeLocParser != other.genomeLocParser ) return false; + if ( extension != other.extension ) return false; + if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; + return true; + } + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? 
+ */ + public boolean isActive() { + return isActive; + } + + /** + * Intersect this active region with the allowed intervals, returning a list of active regions + * that only contain locations present in intervals + * + * Note that the returned list may be empty, if this active region doesn't overlap the set at all + * + * Note that the resulting regions are all empty, regardless of whether the current active region has reads + * + * @param intervals a non-null set of intervals that are allowed + * @return an ordered list of active region where each interval is contained within intervals + */ + @Ensures("result != null") + protected List splitAndTrimToIntervals(final GenomeLocSortedSet intervals) { + final List allOverlapping = intervals.getOverlapping(getLocation()); + final List clippedRegions = new LinkedList(); + + for ( final GenomeLoc overlapping : allOverlapping ) { + clippedRegions.add(trim(overlapping, extension)); + } + + return clippedRegions; + } + + /** + * Trim this active to just the span, producing a new active region without any reads that has only + * the extent of newExtend intersected with the current extent + * @param span the new extend of the active region we want + * @param extension the extension size we want for the newly trimmed active region + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc span, final int extension) { + if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + if ( extension < 0) throw new IllegalArgumentException("the extension size must be 0 or greater"); + final int extendStart = Math.max(1,span.getStart() - extension); + final int maxStop = genomeLocParser.getContigs().getSequence(span.getContigIndex()).getSequenceLength(); + final int extendStop = Math.min(span.getStop() + extension, maxStop); + final GenomeLoc extendedSpan = genomeLocParser.createGenomeLoc(span.getContig(), extendStart, extendStop); + return trim(span, extendedSpan); 
+ +//TODO - Inconsiste support of substates trimming. Check lack of consistency!!!! +// final GenomeLoc subLoc = getLocation().intersect(span); +// final int subStart = subLoc.getStart() - getLocation().getStart(); +// final int subEnd = subStart + subLoc.size(); +// final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); +// return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension ); + + } + + public ActiveRegion trim(final GenomeLoc span) { + return trim(span,span); + } + + /** + * Trim this active to no more than the span, producing a new active region with properly trimmed reads that + * attempts to provide the best possible representation of this active region covering the span. + * + * The challenge here is that span may (1) be larger than can be represented by this active region + * + its original extension and (2) the extension must be symmetric on both sides. This algorithm + * therefore determines how best to represent span as a subset of the span of this + * region with a padding value that captures as much of the span as possible. + * + * For example, suppose this active region is + * + * Active: 100-200 with extension of 50, so that the true span is 50-250 + * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 + * + * Here we represent the active region as a active region from 150-200 with 25 bp of padding. 
+ * + * The overall constraint is that the active region can never exceed the original active region, and + * the extension is chosen to maximize overlap with the desired region + * + * @param span the new extend of the active region we want + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc span, final GenomeLoc extendedSpan) { + if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + if ( extendedSpan == null ) throw new IllegalArgumentException("Active region extended span cannot be null"); + if ( ! extendedSpan.containsP(span)) + throw new IllegalArgumentException("The requested extended must fully contain the requested span"); + + final GenomeLoc subActive = getLocation().intersect(span); + final int requiredOnRight = Math.max(extendedSpan.getStop() - subActive.getStop(), 0); + final int requiredOnLeft = Math.max(subActive.getStart() - extendedSpan.getStart(), 0); + final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); + + final ActiveRegion result = new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); + + final List myReads = getReads(); + final GenomeLoc resultExtendedLoc = result.getExtendedLoc(); + final int resultExtendedLocStart = resultExtendedLoc.getStart(); + final int resultExtendedLocStop = resultExtendedLoc.getStop(); + + final List trimmedReads = new ArrayList<>(myReads.size()); + for( final GATKSAMRecord read : myReads ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion(read, + resultExtendedLocStart, resultExtendedLocStop); + if( result.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) + trimmedReads.add(clippedRead); + } + result.clearReads(); + result.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); + return result; + } + + public void setFinalized(final boolean value) { + hasBeenFinalized = value; + } + + public boolean 
isFinalized() { + return hasBeenFinalized; + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcid.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcid.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcid.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidTable.java diff --git a/public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/analysis/AminoAcidUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQ.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQ.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/JVMUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/JVMUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PluginManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PluginManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/ProtectedPackageSource.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/classloader/PublicPackageSource.java diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ReadClipper.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ReadClipper.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java new file mode 100644 index 000000000..70241a6c4 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -0,0 +1,354 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.codecs.sampileup; + +import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.util.ParsingUtils; + +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; + +/** + * Decoder for SAM pileup data. + * + *

+ * From the SAMTools project documentation: + *

+ *

The Pileup format was first used by Tony Cox and Zemin Ning at + * the Sanger Institute. It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced + * in Samtools by mpileup, which produces a slightly different output format by default. + *

+ + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ *

+ * See the Pileup format documentation for more details. + *

+ * + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

    + *
  • original 6 columns as described above
  • + *
  • 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the + * reads covering the sites) for all sites, inserted before the bases and quality strings
  • + *
  • 3 extra columns indicating counts of reads supporting indels (just for indel sites)
  • + *
+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Caveat

+ *

Handling of indels is questionable at the moment. Proceed with care.

+ * + * + * @author Matt Hanna, Geraldine VdAuwera + * @since 2014 + */ +public class SAMPileupCodec extends AsciiFeatureCodec { + // number of tokens expected (6 or 10 are valid, anything else is wrong) + private static final int basicTokenCount = 6; + private static final int consensusSNPTokenCount = 10; + private static final int consensusIndelTokenCount = 13; + private static final char fldDelim = '\t'; + // allocate once and don't ever bother creating them again: + private static final String baseA = "A"; + private static final String baseC = "C"; + private static final String baseG = "G"; + private static final String baseT = "T"; + private static final String emptyStr = ""; // we will use this for "reference" allele in insertions + + public SAMPileupCodec() { + super(SAMPileupFeature.class); + } + + public SAMPileupFeature decode(String line) { + //+1 because we want to know if we have more than the max + String[] tokens = new String[consensusIndelTokenCount+1]; + + // split the line + final int count = ParsingUtils.split(line,tokens,fldDelim); + + SAMPileupFeature feature = new SAMPileupFeature(); + + /** + * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation. + * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at + * the end for indels. We are currently not making any use of the extra indel tokens. + * + * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong. 
+ */ + final String observedString, bases, quals; + + feature.setChr(tokens[0]); + feature.setStart(Integer.parseInt(tokens[1])); + + if(tokens[2].length() != 1) { + throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); + } + feature.setRef(tokens[2].charAt(0)); + + switch (count) { + case basicTokenCount: + bases = tokens[4]; + quals = tokens[5]; + // parsing is pretty straightforward for 6-col format + if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format + throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format"); + } else { + parseBasesAndQuals(feature, bases, quals); + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + } + break; + case consensusSNPTokenCount: // pileup called a SNP or a reference base + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + bases = tokens[8]; + quals = tokens[9]; + // confirm that we have a non-variant, not a mis-parsed indel + if ( feature.getRef() == '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel"); + } + // Parse the SNP or non-variant + parseBasesAndQuals(feature, bases, quals); + if ( observedString.length() != 1 ) { + throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString); + } + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + + char ch = observedString.charAt(0); + + switch ( ch ) { // record alleles (decompose ambiguous base codes) + case 'A': feature.getFWDAlleles().add(baseA); 
feature.getFWDAlleles().add(baseA); break; + case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; + case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; + case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; + case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; + case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; + case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; + case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; + case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; + case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; + } + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); + else { + // we know that at least one allele is non-ref; + // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. + // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) + feature.setVariantType(VariantType.SNP); + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) + ) feature.setNumNonRef(1); + else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... 
+ } + break; + case consensusIndelTokenCount: + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + // confirm that we have an indel, not a mis-parsed SNP or non-variant + if ( feature.getRef() != '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'"); + } + // Parse the indel + parseIndels(observedString,feature) ; + if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); + else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!! + break; + default: + throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " + + "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount + + " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount + + " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")"); + } + return feature; + } + + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } + + private void parseIndels(String genotype,SAMPileupFeature feature) { + String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit + + // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; + // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases + // from what we have recorded for the deletion allele (e.g. 
"-CAC") + boolean hasRefAllele = false; + + for ( int i = 0 ; i < obs.length ; i++ ) { + if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { + hasRefAllele = true; + feature.getFWDAlleles().add(emptyStr); + continue; + } + + String varBases = obs[i].toUpperCase(); + + switch ( obs[i].charAt(0) ) { + case '+': + if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.INSERTION); + feature.setRefBases(emptyStr); + break; + case '-' : + if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL); + else feature.setVariantType(VariantType.DELETION); + feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" + break; + default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype); + } + feature.getFWDAlleles().add(varBases); + feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! + } + if ( hasRefAllele ) { + // we got at least one ref. allele (out of two recorded) + if (feature.isReference()) { // both top theories are actually ref allele; + feature.setNumNonRef(0); // no observations of non-reference allele at all + feature.setRefBases(emptyStr); + } else { + feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left + } + } else { + // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we + // fail to set data in a consistent way. + if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1); + else feature.setNumNonRef(2); + } + // DONE with indels + + } + + private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals) + { + //System.out.printf("%s%n%s%n", bases, quals); + + // needs to convert the base string with its . 
and , to the ref base + StringBuilder baseBuilder = new StringBuilder(); + StringBuilder qualBuilder = new StringBuilder(); + boolean done = false; + for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { + //System.out.printf("%d %d%n", i, j); + char c = (char)bases.charAt(i); + + switch ( c ) { + case '.': // matches reference + case ',': // matches reference + baseBuilder.append(feature.getRef()); + qualBuilder.append(quals.charAt(j++)); + break; + case '$': // end of read + break; + case '*': // end of indel? + j++; + break; + case '^': // mapping quality + i++; + break; + case '+': // start of indel + case '-': // start of indel + final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 + final String rest = bases.substring(i+1); + //System.out.printf("sub is %s%n", rest); + Matcher match = regex.matcher(rest); + if ( ! match.matches() ) { + if ( feature.getRef() != '*' ) + throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i); + done = true; + } + else { + String g = match.group(1); + //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); + int l = Integer.parseInt(g); + i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) + //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); + } + break; + default: // non reference base + baseBuilder.append(c); + qualBuilder.append(quals.charAt(j++)); + } + } + + feature.setPileupBases(baseBuilder.toString()); + feature.setPileupQuals(qualBuilder.toString()); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java new file mode 100644 index 000000000..287363601 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java @@ -0,0 +1,276 @@ 
+/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.codecs.sampileup; + +import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; + +import java.util.List; + +/** + * A tribble feature representing a SAM pileup. + * + * Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will + * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as + * input for analysis that requires that information. 
+ * + * @author mhanna + * @version 0.1 + */ +public class SAMPileupFeature implements Feature { + public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; + + private String contig; // genomic location of this genotyped site + private int start; + private int stop; + + private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) + private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions + + private String pileupQuals; // the read base qualities + private String pileupBases; // the read bases themselves + + private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion + private VariantType varType = VariantType.NONE; + private int nNonref = 0; // number of non-reference alleles observed + private int eventLength = 0; // number of inserted or deleted bases + + private double consensusScore = 0; + private double variantScore = 0; + + /** + * create the pileup feature. Default protection so that only other classes in this package can create it. + */ + SAMPileupFeature() {} + + public String getChr() { + return contig; + } + + protected void setChr(String chr) { + this.contig = chr; + } + + public int getStart() { + return start; + } + + protected void setStart(int start) { + this.start = start; + } + + public int getEnd() { + return stop; + } + + protected void setEnd(int end) { + this.stop = end; + } + + public String getQualsAsString() { return pileupQuals; } + + protected void setPileupQuals(String pileupQuals) { + this.pileupQuals = pileupQuals; + } + + /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. 
+ * + */ + public char getRef() { return refBaseChar; } + + protected void setRef(char ref) { + this.refBaseChar = ref; + } + + public int size() { return pileupQuals.length(); } + + /** Returns pile of observed bases over the current genomic location. + * + */ + public String getBasesAsString() { return pileupBases; } + + protected void setPileupBases(String pileupBases) { + this.pileupBases = pileupBases; + } + + /** Returns formatted pileup string for the current genomic location as + * "location: reference_base observed_base_pile observed_qual_pile" + */ + public String getPileupString() + { + if(start == stop) + return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString()); + else + return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString()); + } + + /** + * Gets the bases in byte array form. + * @return byte array of the available bases. + */ + public byte[] getBases() { + return StringUtil.stringToBytes(getBasesAsString()); + } + + /** + * Gets the Phred base qualities without ASCII offset. + * @return Phred base qualities. + */ + public byte[] getQuals() { + byte[] quals = StringUtil.stringToBytes(getQualsAsString()); + for(int i = 0; i < quals.length; i++) quals[i] -= 33; + return quals; + } + + /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single + * character (reference base). For indel genotypes, the string is empty for insertions into + * the reference, or consists of deleted bases for deletions. 
+ * + * @return reference allele, forward strand + */ + public String getFWDRefBases() { + return refBases; + } + + protected void setRefBases(String refBases) { + this.refBases = refBases; + } + + public List getFWDAlleles() { + return observedAlleles; + } + + protected void setFWDAlleles(List alleles) { + this.observedAlleles = alleles; + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType == VariantType.SNP; } + public boolean isInsertion() { return varType == VariantType.INSERTION; } + public boolean isDeletion() { return varType == VariantType.DELETION ; } + public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; } + public boolean isReference() { return varType == VariantType.NONE; } + + protected void setVariantType(VariantType variantType) { + this.varType = variantType; + } + + public boolean isHom() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) ); + return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); + } + + public boolean isHet() { + // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype + if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) ); + return isIndel() || ( ! 
observedAlleles.get(0).equals(observedAlleles.get(1) ) ); + } + + public double getVariantConfidence() { + return variantScore; + } + + protected void setVariantConfidence(double variantScore) { + this.variantScore = variantScore; + } + + public boolean isBiallelic() { + return nNonref < 2; + } + + protected void setNumNonRef(int nNonref) { + this.nNonref = nNonref; + } + + public double getConsensusConfidence() { + return consensusScore; + } + + protected void setConsensusConfidence(double consensusScore) { + this.consensusScore = consensusScore; + } + + public int length() { + return eventLength; + } + + protected void setLength(int eventLength) { + this.eventLength = eventLength; + } + + public boolean isIndelGenotype() { + return refBaseChar == '*'; + } + + + public boolean isPointGenotype() { + return ! isIndelGenotype(); + } + + /** Implements method required by GenotypeList interface. If this object represents + * an indel genotype, then it returns itself through this method. If this object is a + * point genotype, this method returns null. + * @return + */ + public SAMPileupFeature getIndelGenotype() { + if ( isIndelGenotype() ) return this; + else return null; + } + + /** Implements method required by GenotypeList interface. If this object represents + * a point genotype, then it returns itself through this method. If this object is an + * indel genotype, this method returns null. + * @return + */ + public SAMPileupFeature getPointGenotype() { + if ( isPointGenotype() ) return this; + else return null; + } + + /** Returns true if this object \em is an indel genotype (and thus + * indel genotype is what it only has). + * @return + */ + public boolean hasIndelGenotype() { + return isIndelGenotype(); + } + + /** Returns true if this object \em is a point genotype (and thus + * point genotype is what it only has. 
+ * @return + */ + public boolean hasPointGenotype() { + return isPointGenotype(); + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableCodec.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableCodec.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableFeature.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/codecs/table/TableFeature.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/DefaultHashMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/DefaultHashMap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/DefaultHashMap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/DefaultHashMap.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/collections/Pair.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/Pair.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/Pair.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/Pair.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/PrimitivePair.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/PrimitivePair.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/PrimitivePair.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/RODMergingIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/collections/RODMergingIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/collections/RODMergingIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/CryptUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/CryptUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/GATKKey.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/crypt/GATKKey.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DupUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DupUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/duplicates/DuplicateComp.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java new file mode 100644 index 000000000..0f1b473c3 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.exceptions; + +import java.lang.reflect.InvocationTargetException; + +/** + * Class for handling common failures of dynamic class resolution + */ +public class DynamicClassResolutionException extends UserException { + public DynamicClassResolutionException(Class c, Exception ex) { + super(String.format("Could not create module %s because %s caused by exception %s", + c.getSimpleName(), moreInfo(ex), ex.getMessage())); + } + + private static String moreInfo(Exception ex) { + try { + throw ex; + } catch (InstantiationException e) { + return "BUG: cannot instantiate class: must be concrete class"; + } catch (NoSuchMethodException e) { + return "BUG: Cannot find expected constructor for class"; + } catch (IllegalAccessException e) { + return "Cannot instantiate class (Illegal Access)"; + } catch (InvocationTargetException e) { + return "Cannot instantiate class (Invocation failure)"; + } catch ( Exception e ) { + return String.format("an exception of type %s occurred",e.getClass().getSimpleName()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/ReviewedStingException.java 
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/StingException.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/exceptions/StingException.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/StingException.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java new file mode 100644 index 000000000..4db6e3d69 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -0,0 +1,485 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.exceptions; + +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.io.File; + +/** + * Represents the common user errors detected by Sting / GATK + * + * Root class for all GATK user errors, as well as the container for errors themselves + */ +@DocumentedGATKFeature( + groupName = HelpConstants.DOCS_CAT_USRERR, + summary = "Errors caused by incorrect user behavior, such as bad files, bad arguments, etc." ) +public class UserException extends ReviewedStingException { + /** + * The URL where people can get help messages. Printed when an error occurs + */ + public static final String PHONE_HOME_DOCS_URL = "http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest"; + + public UserException(String msg) { super(msg); } + public UserException(String msg, Throwable e) { super(msg, e); } + private UserException(Throwable e) { super("", e); } // cannot be called, private access + + protected static String getMessage(Throwable t) { + String message = t.getMessage(); + return message != null ? 
message : t.getClass().getName(); + } + + public static class CommandLineException extends UserException { + public CommandLineException(String message) { + super(String.format("Invalid command line: %s", message)); + } + } + + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class IncompatibleReadFiltersException extends CommandLineException { + public IncompatibleReadFiltersException(final String filter1, final String filter2) { + super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + + public static class UnsupportedCigarOperatorException extends UserException { + public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord read, final String message) { + super(String.format( + "Unsupported CIGAR operator %s in read %s at %s:%d. 
%s", + co, + read.getReadName(), + read.getReferenceName(), + read.getAlignmentStart(), + message)); + } + } + + + public static class MalformedGenomeLoc extends UserException { + public MalformedGenomeLoc(String message, GenomeLoc loc) { + super(String.format("Badly formed genome loc: %s: %s", message, loc)); + } + + public MalformedGenomeLoc(String message) { + super(String.format("Badly formed genome loc: %s", message)); + } + } + + public static class BadInput extends UserException { + public BadInput(String message) { + super(String.format("Bad input: %s", message)); + } + } + + // todo -- fix up exception cause passing + public static class MissingArgument extends CommandLineException { + public MissingArgument(String arg, String message) { + super(String.format("Argument %s was missing: %s", arg, message)); + } + } + + public static class BadArgumentValue extends CommandLineException { + public BadArgumentValue(String arg, String message) { + super(String.format("Argument %s has a bad value: %s", arg, message)); + } + } + + public static class UnknownTribbleType extends CommandLineException { + public UnknownTribbleType(String type, String message) { + super(String.format("Unknown tribble type %s: %s", type, message)); + } + } + + + public static class BadTmpDir extends UserException { + public BadTmpDir(String message) { + super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); + } + } + + public static class TooManyOpenFiles extends UserException { + public TooManyOpenFiles() { + super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. 
See the unix ulimit command to adjust this limit")); + } + } + + public static class LocalParallelizationProblem extends UserException { + public LocalParallelizationProblem(final File file) { + super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + } + } + + public static class NotEnoughMemory extends UserException { + public NotEnoughMemory() { + super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); + } + } + + public static class ErrorWritingBamFile extends UserException { + public ErrorWritingBamFile(String message) { + super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); + } + } + + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + + public static class CouldNotReadInputFile extends UserException { + public CouldNotReadInputFile(String message, Exception e) { + super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); + } + + public CouldNotReadInputFile(File file) { + super(String.format("Couldn't read file %s", file.getAbsolutePath())); + } + + public CouldNotReadInputFile(File file, String message) { + super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); + } + + public CouldNotReadInputFile(String file, String message) { + super(String.format("Couldn't read file %s because %s", file, message)); + } + + public CouldNotReadInputFile(File file, String message, Exception e) { + super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + } + + public CouldNotReadInputFile(File file, Exception e) { + this(file, getMessage(e)); + } + + public CouldNotReadInputFile(String message) { + super(message); + } + } + + + public static class CouldNotCreateOutputFile extends UserException { + public CouldNotCreateOutputFile(File file, String message, Exception e) { + super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + } + + public CouldNotCreateOutputFile(File file, String message) { + super(String.format("Couldn't write file %s because %s", file.getAbsolutePath(), message)); + } + + public CouldNotCreateOutputFile(String filename, String message, Exception e) { + super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); + } + + public CouldNotCreateOutputFile(File file, Exception e) { + super(String.format("Couldn't write file %s 
because exception %s", file.getAbsolutePath(), getMessage(e))); + } + + public CouldNotCreateOutputFile(String message, Exception e) { + super(message, e); + } + } + + public static class MissortedBAM extends UserException { + public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { + super(String.format("Missorted Input SAM/BAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); + } + + public MissortedBAM(SAMFileHeader.SortOrder order, String message) { + super(String.format("Missorted Input SAM/BAM files: files are not sorted in %s order; %s", order, message)); + } + + public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { + super(String.format("Missorted Input SAM/BAM file %s: file sorted in %s order but %s is required; %s", + read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); + } + + public MissortedBAM(String message) { + super(String.format("Missorted Input SAM/BAM files: %s", message)); + } + } + + public static class MalformedBAM extends UserException { + public MalformedBAM(SAMRecord read, String message) { + this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message); + } + + public MalformedBAM(File file, String message) { + this(file.toString(), message); + } + + public MalformedBAM(String source, String message) { + super(String.format("SAM/BAM file %s is malformed: %s", source, message)); + } + } + + public static class MisencodedBAM extends UserException { + public MisencodedBAM(SAMRecord read, String message) { + this(read.getFileSource() != null ? 
read.getFileSource().getReader().toString() : "(none)", message); + } + + public MisencodedBAM(String source, String message) { + super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); + } + } + + public static class MalformedVCF extends UserException { + public MalformedVCF(String message, String line) { + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message) { + super(String.format("The provided VCF file is malformed: %s", message)); + } + + public MalformedVCF(String message, int lineNo) { + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + } + } + + public static class MalformedBCF2 extends UserException { + public MalformedBCF2( String message ) { + super(String.format("Malformed BCF2 file: %s", message)); + } + } + + public static class MalformedVCFHeader extends UserException { + public MalformedVCFHeader(String message) { + super(String.format("The provided VCF file has a malformed header: %s", message)); + } + } + + public static class ReadMissingReadGroup extends MalformedBAM { + public ReadMissingReadGroup(final SAMRecord read) { + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + } + } + + public static class ReadHasUndefinedReadGroup extends MalformedBAM { + public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); + } + } + + public static class VariantContextMissingRequiredField extends UserException { + public VariantContextMissingRequiredField(String field, VariantContext vc) { + super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); + } + } + + public static class MissortedFile extends UserException { + public MissortedFile(File file, String message, Exception e) { + super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); + } + } + + public static class FailsStrictValidation extends UserException { + public FailsStrictValidation(File f, String message) { + super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message)); + } + } + + public static class MalformedFile extends UserException { + public MalformedFile(String message) { + super(String.format("Unknown file is malformed: %s", message)); + } + + public MalformedFile(String message, Exception e) { + super(String.format("Unknown file is malformed: %s caused by %s", message, getMessage(e))); + } + + public MalformedFile(File f, String message) { + super(String.format("File %s is malformed: %s", f.getAbsolutePath(), message)); + } + + public MalformedFile(File f, String message, Exception e) { + super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, getMessage(e))); + } + + public MalformedFile(String name, String message) { + super(String.format("File associated with name %s is malformed: %s", name, message)); + } + + public MalformedFile(String name, String message, Exception e) { + super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, getMessage(e))); + } + } + + public static class CannotExecuteRScript extends UserException { + 
public CannotExecuteRScript(String message) { + super(String.format("Unable to execute RScript command: " + message)); + } + public CannotExecuteRScript(String message, Exception e) { + super(String.format("Unable to execute RScript command: " + message), e); + } + } + + public static class DeprecatedArgument extends CommandLineException { + public DeprecatedArgument(String param, String doc) { + super(String.format("The parameter %s is deprecated. %s",param,doc)); + } + } + + + public static class IncompatibleSequenceDictionaries extends UserException { + public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { + super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", + name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); + } + } + + public static class LexicographicallySortedSequenceDictionary extends UserException { + public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { + super(String.format("Lexicographically sorted human genome sequence detected in %s." + + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." 
+ + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") + + "\n %s contigs = %s", + name, name, ReadUtils.prettyPrintSequenceRecords(dict))); + } + } + + public static class DeprecatedWalker extends UserException { + public DeprecatedWalker(String walkerName, String version) { + super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); + } + } + + public static class DeprecatedAnnotation extends UserException { + public DeprecatedAnnotation(String annotationName, String version) { + super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); + } + } + + public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: " + message)); + } + public CannotExecuteQScript(String message, Exception e) { + super(String.format("Unable to execute QScript: " + message), e); + } + } + + public static class CannotHandleGzippedRef extends UserException { + public CannotHandleGzippedRef() { + super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); + } + } + + public static class MissingReferenceFaiFile extends UserException { + public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { + super(String.format("Fasta index file %s for reference %s does not exist. 
Please see %s for help creating it.", + indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class MissingReferenceDictFile extends UserException { + public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { + super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.", + dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class UnreadableKeyException extends UserException { + public UnreadableKeyException ( File f, Exception e ) { + super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + + "Please see %s for help.", + f.getAbsolutePath(), getMessage(e), PHONE_HOME_DOCS_URL)); + } + + public UnreadableKeyException ( String message, Exception e ) { + this(String.format("%s. Error was: %s", message, getMessage(e))); + } + + public UnreadableKeyException ( String message ) { + super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + + "Please see %s for help.", + message, PHONE_HOME_DOCS_URL)); + } + } + + public static class KeySignatureVerificationException extends UserException { + public KeySignatureVerificationException ( File f ) { + super(String.format("The signature in key file %s failed cryptographic verification. " + + "If this key was valid in the past, it's likely been revoked. " + + "Please see %s for help.", + f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); + } + } + + public static class GVCFIndexException extends UserException { + public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) { + super(String.format("GVCF output requires a specific indexing strategy. 
Please re-run including the arguments " + + "-variant_index_type %s -variant_index_parameter %d.", + indexType, indexParameter)); + } + } + + /** + * A special exception that happens only in the case where + * the filesystem, by design or configuration, is completely unable + * to handle locking. This exception will specifically NOT be thrown + * in the case where the filesystem handles locking but is unable to + * acquire a lock due to concurrency. + */ + public static class FileSystemInabilityToLockException extends UserException { + public FileSystemInabilityToLockException( String message ) { + super(message); + } + + public FileSystemInabilityToLockException( String message, Exception innerException ) { + super(message,innerException); + } + } + + public static class IncompatibleRecalibrationTableParameters extends UserException { + public IncompatibleRecalibrationTableParameters(String s) { + super(s); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/ArtificialFastaUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/package-info.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/utils/fasta/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fasta/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/file/FSLockWithShared.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/file/FSLockWithShared.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentCollection.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentCollection.java diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/fragments/FragmentUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/EventMap.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/EventMap.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/Haplotype.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java diff 
--git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ApplicationDetails.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ApplicationDetails.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ApplicationDetails.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocletUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocletUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/DocletUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocletUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java new file mode 100644 index 000000000..0afcdae02 --- /dev/null +++ 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import java.lang.annotation.*; + +/** + * An annotation to identify a class as a GATK capability for documentation + * + * @author depristo + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DocumentedGATKFeature { + /** Should we actually document this feature, even though it's annotated? */ + public boolean enable() default true; + /** The overall group name (walkers, readfilters) this feature is associated with */ + public String groupName(); + /** A human readable summary of the purpose of this group of features */ + public String summary() default ""; + /** Are there links to other docs that we should include? 
CommandLineGATK.class for walkers, for example? */ + public Class[] extraDocs() default {}; + /** Who is the go-to developer for operation/documentation issues? */ + public String gotoDev() default "NA"; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java new file mode 100644 index 000000000..ad0959bfe --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +/** + * Documentation unit. Effectively a class version of the DocumentedGATKFeature. + * Immutable data structure. + * + * @author depristo + */ +class DocumentedGATKFeatureObject { + /** Which class are we documenting. Specific to each class being documented */ + private final Class classToDoc; + /** Are we enabled? */ + private final boolean enable; + private final String groupName, summary, gotoDev; + private final Class[] extraDocs; + + public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs, final String gotoDev) { + this.classToDoc = classToDoc; + this.enable = enable; + this.groupName = groupName; + this.summary = summary; + this.extraDocs = extraDocs; + this.gotoDev = gotoDev; + } + + public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary, final String gotoDev) { + this(classToDoc, true, groupName, summary, new Class[]{}, gotoDev); + } + + public Class getClassToDoc() { return classToDoc; } + public boolean enable() { return enable; } + public String groupName() { return groupName; } + public String summary() { return summary; } + public Class[] extraDocs() { return extraDocs; } + public String gotoDev() { return gotoDev; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumAPIUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumAPIUtils.java diff 
--git a/public/java/src/org/broadinstitute/sting/utils/help/ForumDiscussion.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumDiscussion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ForumDiscussion.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ForumDiscussion.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java new file mode 100644 index 000000000..f0166bc9c --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -0,0 +1,538 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to 
the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; +import freemarker.template.Configuration; +import freemarker.template.DefaultObjectWrapper; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker + * templates to produce html formatted GATKDocs for walkers + * and other classes. + *

+ * This document has the following workflow: + *

+ * 1 -- walk the javadoc hierarchy, looking for class that have the + * DocumentedGATKFeature annotation or are in the type hierarchy in the + * static list of things to document, and are to be documented + * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete + * set of things to document + * 3 -- for each unit, actually generate an html page documenting it + * as well as links to related features via their units. Writing + * of a specific class HTML is accomplished by a generate DocumentationHandler + * 4 -- write out an index of all units, organized by group + *

+ * The documented classes are restricted to only those with @DocumentedGATKFeature + * annotation or are in the STATIC_DOCS class. + */ +public class GATKDoclet { + final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + + /** + * Where we find the help FreeMarker templates + */ + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + + /** + * Where we write the GATKDoc html directory + */ + final protected static File DESTINATION_DIR = new File("gatkdocs"); + + final private static String FORUM_KEY_PATH = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; + // ---------------------------------------------------------------------- + // + // Global variables that are set on the command line by javadoc + // + // ---------------------------------------------------------------------- + protected static File settingsDir = SETTINGS_DIR; + protected static File destinationDir = DESTINATION_DIR; + protected static String forumKeyPath = FORUM_KEY_PATH; + protected static String buildTimestamp = null, absoluteVersion = null; + protected static boolean showHiddenFeatures = false; + + protected static boolean testOnly = false; + + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. + */ + private static final List> testOnlyKeepers = Arrays.asList( + DocumentationTest.class, CommandLineGATK.class, UserException.class); + + /** + * The javadoc root doc + */ + RootDoc rootDoc; + + /** + * The set of all things we are going to document + */ + Set myWorkUnits; + + /** + * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends + * one of the DocumentedGATKFeatureObjects.clazz of this collection will also + * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. 
Useful + * when you want to document things that implement an interface (annotations on java + * interfaces aren't inherited) or whose base class isn't under your control (tribble + * codecs). + */ + final static Collection STATIC_DOCS = new ArrayList(); + + static { + STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, + HelpConstants.DOCS_CAT_RODCODECS, + "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED", + "NA")); + } + + + /** + * Extracts the contents of certain types of javadoc and adds them to an XML file. + * + * @param rootDoc The documentation root. + * @return Whether the JavaDoc run succeeded. + * @throws java.io.IOException if output can't be written. + */ + public static boolean start(RootDoc rootDoc) throws IOException { + logger.setLevel(Level.INFO); + + // load arguments + for (String[] options : rootDoc.options()) { + if (options[0].equals("-settings-dir")) + settingsDir = new File(options[1]); + if (options[0].equals("-destination-dir")) + destinationDir = new File(options[1]); + if (options[0].equals("-forum-key-path")) + forumKeyPath = options[1]; + if (options[0].equals("-build-timestamp")) + buildTimestamp = options[1]; + if (options[0].equals("-absolute-version")) + absoluteVersion = options[1]; + if (options[0].equals("-include-hidden")) + showHiddenFeatures = true; + if (options[0].equals("-test")) + testOnly = true; + } + + if (!settingsDir.exists()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " does not exist"); + else if (!settingsDir.isDirectory()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " is not a directory"); + + // process the docs + new GATKDoclet().processDocs(rootDoc); + + + return true; + } + + /** + * Validate the given options against options supported by this doclet. + * + * @param option Option to validate. + * @return Number of potential parameters; 0 if not supported. 
+ */ + public static int optionLength(String option) { + if (option.equals("-settings-dir") || + option.equals("-destination-dir") || + option.equals("-forum-key-path") || + option.equals("-build-timestamp") || + option.equals("-absolute-version") || + option.equals("-include-hidden")) { + return 2; + } else if (option.equals("-test")) + return 1; + else + return 0; + } + + /** + * Are we supposed to include @Hidden annotations in our documented output? + * + * @return + */ + public boolean showHiddenFeatures() { + return showHiddenFeatures; + } + + /** + * @param rootDoc + */ + private void processDocs(RootDoc rootDoc) { + // setup the global access to the root + this.rootDoc = rootDoc; + + try { + // basic setup + destinationDir.mkdirs(); + FileUtils.copyFile(new File(settingsDir + "/bootstrap.min.css"), new File(destinationDir + "/bootstrap.min.css")); + FileUtils.copyFile(new File(settingsDir + "/bootstrap.min.js"), new File(destinationDir + "/bootstrap.min.js")); + FileUtils.copyFile(new File(settingsDir + "/jquery.min.js"), new File(destinationDir + "/jquery.min.js")); + // print the Version number + FileUtils.writeByteArrayToFile(new File(destinationDir + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); + + /* ------------------------------------------------------------------- */ + /* You should do this ONLY ONCE in the whole application life-cycle: */ + + Configuration cfg = new Configuration(); + // Specify the data source where the template files come from. + cfg.setDirectoryForTemplateLoading(settingsDir); + // Specify how templates will see the data-model. This is an advanced topic... 
+ cfg.setObjectWrapper(new DefaultObjectWrapper()); + + myWorkUnits = computeWorkUnits(); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : myWorkUnits) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + for (GATKDocWorkUnit workUnit : myWorkUnits) { + processDocWorkUnit(cfg, workUnit, groups, data); + } + + processIndex(cfg, new ArrayList(myWorkUnits)); + + File forumKeyFile = new File(forumKeyPath); + if (forumKeyFile.exists()) { + String forumKey = null; + // Read in a one-line file so we can do a for loop + for (String line : new XReadLines(forumKeyFile)) + forumKey = line; + updateForum(myWorkUnits, forumKey); + } + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void updateForum(Set docWorkUnits, String forumKey) { + //first get list of posts that need to be added + List old = ForumAPIUtils.getPostedTools(forumKey); + + for (String s : old) + System.out.println(s); + + System.out.printf("Forum has %d items%n", old.size()); + System.out.printf("Docs have %d items%n", docWorkUnits.size()); + + List toAdd = new ArrayList(); + for (GATKDocWorkUnit tool : docWorkUnits) { + if (!old.contains(tool.name)) { + System.out.println("WILL POST: " + tool.name + " TO FORUM"); + toAdd.add(tool); + } + } + + //update using list + for (GATKDocWorkUnit tool : toAdd) { + //if ( tool.name.equals("ApplyRecalibration") ) + ForumAPIUtils.postToForum(tool, forumKey); + } + } + + /** + * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
+ * + * @return + */ + private Set computeWorkUnits() { + TreeSet m = new TreeSet(); + + for (ClassDoc doc : rootDoc.classes()) { + //logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + // don't add anything that's not DocumentationTest if we are in test mode + if (clazz != null && testOnly && !testOnlyKeepers.contains(clazz)) + continue; + + //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + // logger.debug("foo"); + + DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if (handler != null && handler.includeInDocs(doc)) { + //logger.info("Generating documentation for class " + doc); + String filename = handler.getDestinationFilename(doc, clazz); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + /** + * Create a handler capable of documenting the class doc according to feature. Returns + * null if no appropriate handler is found or doc shouldn't be documented at all. + * + * @param doc + * @param feature + * @return + */ + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { + if (feature != null) { + if (feature.enable()) { + DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); + } + } + + return null; + } + + /** + * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc + * structure we will apply to Doc. 
+ * + * @param doc + * @return null if this proves inappropriate or doc shouldn't be documented + */ + private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { + Class docClass = getClassForClassDoc(doc); + + if (docClass == null) + return null; // not annotated so it shouldn't be documented + + if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { + DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); + return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs(), f.gotoDev()); + } else { + for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { + if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { + return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs(), staticDocs.gotoDev()); + } + } + return null; + } + } + + /** + * Return the Java class described by the ClassDoc doc + * + * @param doc + * @return + */ + private Class getClassForClassDoc(ClassDoc doc) { + try { + // todo -- what do I need the ? extends Object to pass the compiler? + return (Class) DocletUtils.getClassForDoc(doc); + } catch (ClassNotFoundException e) { + //logger.warn("Couldn't find class for ClassDoc " + doc); + // we got a classdoc for a class we can't find. 
Maybe in a library or something + return null; + } catch (NoClassDefFoundError e) { + return null; + } catch (UnsatisfiedLinkError e) { + return null; // naughty BWA bindings + } + } + + /** + * Create the html index listing all of the GATKDocs features + * + * @param cfg + * @param indexData + * @throws IOException + */ + private void processIndex(Configuration cfg, List indexData) throws IOException { + /* Get or create a template */ + Template temp = cfg.getTemplate("generic.index.template.html"); + + /* Merge data-model with template */ + Writer out = new OutputStreamWriter(new FileOutputStream(new File(destinationDir + "/index.html"))); + try { + temp.process(groupIndexData(indexData), out); + out.flush(); + } catch (TemplateException e) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } + } + + /** + * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, + * create the high-level grouping data listing individual features by group. + * + * @param indexData + * @return + */ + private Map groupIndexData(List indexData) { + // + // root -> data -> { summary -> y, filename -> z }, etc + // -> groups -> group1, group2, etc. 
+ Map root = new HashMap(); + + Collections.sort(indexData); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : indexData) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + //System.out.printf(groups.toString()); + + root.put("data", data); + root.put("groups", groups); + root.put("timestamp", buildTimestamp); + root.put("version", absoluteVersion); + + return root; + } + + /** + * Trivial helper routine that returns the map of name and summary given the annotation + * AND adds a super-category so that we can custom-order the categories in the index + * + * @param annotation + * @return + */ + private static final Map toMap(DocumentedGATKFeatureObject annotation) { + Map root = new HashMap(); + root.put("id", annotation.groupName().replaceAll("\\W", "")); + root.put("name", annotation.groupName()); + root.put("summary", annotation.summary()); + + /** + * Add-on super-category definitions. The assignments depend on parsing the names + * defined in HelpConstants.java so be careful of changing anything. + * Also, the super-category value strings need to be the same as used in the + * Freemarker template. This is all fairly clunky but the best I could do without + * making major changes to the DocumentedGATKFeatureObject. Doesn't help that + * Freemarker makes any scripting horribly awkward. 
+ */ + final String supercatValue; + if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; + else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; + else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; + else if (annotation.groupName().endsWith(" (DevZone)")) supercatValue = "dev"; + else supercatValue = "other"; + + root.put("supercat", supercatValue); + + return root; + } + + /** + * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units + * + * @param c the class we are looking for + * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found + */ + public final GATKDocWorkUnit findWorkUnitForClass(Class c) { + for (final GATKDocWorkUnit unit : this.myWorkUnits) + if (unit.clazz.equals(c)) + return unit; + return null; + } + + /** + * Return the ClassDoc associated with clazz + * + * @param clazz + * @return + */ + public ClassDoc getClassDocForClass(Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + /** + * High-level function that processes a single DocWorkUnit unit using its handler + * + * @param cfg + * @param unit + * @param data + * @throws IOException + */ + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) + throws IOException { + //System.out.printf("Processing documentation for class %s%n", unit.classDoc); + + unit.handler.processOne(unit); + unit.forTemplate.put("groups", groups); + unit.forTemplate.put("data", data); + // Get or create a template + Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); + + // Merge data-model with template + File outputPath = new File(destinationDir + "/" + unit.filename); + try { + Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); + temp.process(unit.forTemplate, out); + out.flush(); + } catch (TemplateException e) { + throw new 
ReviewedStingException("Failed to create GATK documentation", e); + } + } + + private static String getSimpleVersion(String absoluteVersion) { + String[] parts = absoluteVersion.split("-"); + + // by skipping i=0, there is no trailing separator + for (int i = 1; i < 2; i++) { + parts[0] = parts[0].concat("-"); + parts[0] = parts[0].concat(parts[i]); + } + + return parts[0]; + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java new file mode 100644 index 000000000..06c0e1c26 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -0,0 +1,934 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.help; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.Tag; +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.*; +import java.util.*; + +/** + * + */ +public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { + private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); + + /** + * The max. length of the longest of --fullName -shortName argument name + * before we prefer the shorter option. 
+ */ + private static final int MAX_DISPLAY_NAME = 30; + + /** + * The Class we are documenting + */ + private GATKDocWorkUnit toProcess; + + @Override + public boolean includeInDocs(ClassDoc doc) { + try { + Class type = DocletUtils.getClassForDoc(doc); + boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class); + return !hidden && JVMUtils.isConcrete(type); + } catch (ClassNotFoundException e) { + return false; + } + } + + + @Override + public String getTemplateName(ClassDoc doc) throws IOException { + return "generic.template.html"; + } + + @Override + public void processOne(GATKDocWorkUnit toProcessArg) { + this.toProcess = toProcessArg; + + //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); + Map root = new HashMap(); + + addHighLevelBindings(root); + addArgumentBindings(root); + addRelatedBindings(root); + root.put("group", toProcess.group); + + // Adding in retrieval of peripheral info (rf annotations etc) + getClazzAnnotations(toProcess.clazz, root); + + toProcess.setHandlerContent((String) root.get("summary"), root); + } + + /** + * Add high-level summary information about toProcess to root, such as its + * name, summary, description, version, etc. + * + * @param root + */ + protected void addHighLevelBindings(Map root) { + root.put("name", toProcess.classDoc.name()); + + // Extract overrides from the doc tags. 
+ StringBuilder summaryBuilder = new StringBuilder(); + for (Tag tag : toProcess.classDoc.firstSentenceTags()) + summaryBuilder.append(tag.text()); + root.put("summary", summaryBuilder.toString()); + root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); + root.put("timestamp", toProcess.buildTimestamp); + root.put("version", toProcess.absoluteVersion); + + for (Tag tag : toProcess.classDoc.tags()) { + root.put(tag.name(), tag.text()); + } + + root.put("gotoDev", toProcess.annotation.gotoDev()); + } + + /** + * Add bindings describing related GATK capabilites to toProcess + * + * @param root + */ + protected void addRelatedBindings(Map root) { + List> extraDocsData = new ArrayList>(); + + // add in all of the explicitly related items + for (final Class extraDocClass : toProcess.annotation.extraDocs()) { + final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); + if (otherUnit == null) + throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); + extraDocsData.add( + new HashMap() {{ + put("filename", otherUnit.filename); + put("name", otherUnit.name); + }}); + } + root.put("extradocs", extraDocsData); + } + + /** + * Add information about all of the arguments available to toProcess to root + * + * @param root + */ + protected void addArgumentBindings(Map root) { + ParsingEngine parsingEngine = createStandardGATKParsingEngine(); + + Map>> args = createArgumentMap(); + root.put("arguments", args); + try { + // loop over all of the arguments according to the parsing engine + for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) { + ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); + FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); + Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); + 
if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) { + final String kind = docKindOfArg(argumentSource); + // Retrieve default value + final Object value = argumentValue(toProcess.clazz, argumentSource); + if (value != null) + argBindings.put("defaultValue", prettyPrintValueString(value)); + // Retrieve min and max / hard and soft value thresholds for numeric args + if (value instanceof Number) { + if (argumentSource.field.isAnnotationPresent(Argument.class)) { + argBindings.put("minValue", argumentSource.field.getAnnotation(Argument.class).minValue()); + argBindings.put("maxValue", argumentSource.field.getAnnotation(Argument.class).maxValue()); + if (argumentSource.field.getAnnotation(Argument.class).minRecommendedValue() != Double.NEGATIVE_INFINITY) { + argBindings.put("minRecValue", argumentSource.field.getAnnotation(Argument.class).minRecommendedValue()); + } + if (argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue() != Double.POSITIVE_INFINITY) { + argBindings.put("maxRecValue", argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue()); + } + } + } + // Finalize argument bindings + args.get(kind).add(argBindings); + args.get("all").add(argBindings); + } + } + + // sort the arguments + for (Map.Entry>> entry : args.entrySet()) { + entry.setValue(sortArguments(entry.getValue())); + } + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Return the argument kind (required, advanced, hidden, etc) of this argumentSource + * + * @param argumentSource + * @return + */ + @Requires("argumentSource != null") + @Ensures("result != null") + private String docKindOfArg(ArgumentSource argumentSource) { + if (argumentSource.isRequired()) { + if (argumentSource.isInput()) return "required_in"; + else if (argumentSource.isOutput()) return "required_out"; + else if (argumentSource.isFlag()) return "required_flag"; + else return "required_param"; + } + else if (argumentSource.isAdvanced()) 
{ + if (argumentSource.isInput()) return "advanced_in"; + else if (argumentSource.isOutput()) return "advanced_out"; + else if (argumentSource.isFlag()) return "advanced_flag"; + else return "advanced_param"; + } + else if (argumentSource.isHidden()) return "hidden"; + else if (argumentSource.isDeprecated()) return "deprecated"; + else { + if (argumentSource.isInput()) return "optional_in"; + else if (argumentSource.isOutput()) return "optional_out"; + else if (argumentSource.isFlag()) return "optional_flag"; + else return "optional_param"; + } + } + + /** + * Attempts to determine the value of argumentSource in an instantiated version of c + * + * @param c + * @param argumentSource + * @return value of argumentSource, or null if this isn't possible + */ + @Requires({"c != null", "argumentSource != null"}) + private Object argumentValue(Class c, ArgumentSource argumentSource) { + // get the value of the field + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(toProcess.clazz); + if (instance != null) { + final Object value = getFieldValue(instance, argumentSource.field.getName()); + if (value != null) + return value; + + if (argumentSource.createsTypeDefault()) { + try { // handle the case where there's an implicit default + return argumentSource.typeDefaultDocString(); + } catch (ReviewedStingException e) { + ; // failed to create type default, don't worry about it + } + } + } + + return null; + } + + /** + * Create the argument map for holding class arguments + * + * @return + */ + private Map>> createArgumentMap() { + Map>> args = new HashMap>>(); + args.put("all", new ArrayList>()); + args.put("required_in", new ArrayList>()); + args.put("required_out", new ArrayList>()); + args.put("required_param", new ArrayList>()); + args.put("required_flag", new ArrayList>()); + args.put("optional_in", new ArrayList>()); + args.put("optional_out", new ArrayList>()); + args.put("optional_param", new ArrayList>()); + 
args.put("optional_flag", new ArrayList>()); + args.put("advanced_in", new ArrayList>()); + args.put("advanced_out", new ArrayList>()); + args.put("advanced_param", new ArrayList>()); + args.put("advanced_flag", new ArrayList>()); + args.put("hidden", new ArrayList>()); + args.put("deprecated", new ArrayList>()); + return args; + } + + + /** + * Sorts the individual argument list in unsorted according to CompareArgumentsByName + * + * @param unsorted + * @return + */ + private List> sortArguments(List> unsorted) { + Collections.sort(unsorted, new CompareArgumentsByName()); + return unsorted; + } + + /** + * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes + */ + private class CompareArgumentsByName implements Comparator> { + public int compare(Map x, Map y) { + return elt(x).compareTo(elt(y)); + } + + private String elt(Map m) { + String v = m.get("name").toString().toLowerCase(); + if (v.startsWith("--")) + return v.substring(2); + else if (v.startsWith("-")) + return v.substring(1); + else + throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); + } + } + + /** + * Umbrella function that groups the collection of values for specific annotations applied to an + * instance of class c. Lists of collected values are added directly to the "toProcess" object. + * Requires being able to instantiate the class. 
+ * + * @param classToProcess the object to instantiate and query for the annotation + * @param root the root of the document handler, to which we'll store collected annotations + */ + private void getClazzAnnotations(Class classToProcess, Map root) { + // + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(classToProcess); + if (instance != null) { + final Class myClass = instance.getClass(); + // Get parallelism options + final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); + root.put("parallel", parallelOptions); + // Get annotation info (what type of annotation, standard etc.) + final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); + root.put("annotinfo", StringUtils.join(annotInfo, ", ")); + // Get annotation field (whether it goes in INFO or FORMAT) + root.put("annotfield", getAnnotField(myClass)); + // Get walker type if applicable + root.put("walkertype", getWalkerType(myClass)); + // Get partition type if applicable + root.put("partitiontype", getPartitionType(myClass)); + // Get read filter annotations (ReadFilters) if applicable + final HashSet> bucket= getReadFilters(myClass, new HashSet>()); + root.put("readfilters", bucket); + // Get default downsampling settings + final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); + root.put("downsampling", dsSettings); + // Get reference window size settings + final HashMap refwindow = getRefWindow(myClass, new HashMap()); + root.put("refwindow", refwindow); + // Get ActiveRegion size settings + final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); + root.put("activeregion", activeRegion); + // anything else? 
+ } else { + // put empty items to avoid blowups + root.put("parallel", new HashSet()); + root.put("annotinfo", ""); + root.put("annotfield", ""); + root.put("walkertype", ""); + root.put("partitiontype", ""); + root.put("readfilters", new HashSet>()); + root.put("downsampling", new HashMap()); + root.put("refwindow", new HashMap()); + root.put("activeregion", new HashMap()); + } + } + + /** + * Utility function that checks which parallelism options are available for an instance of class c. + * + * @param myClass the class to query for the interfaces + * @param parallelOptions an empty HashSet in which to collect the info + * @return a hash set of parallelism options, otherwise an empty set + */ + private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + final HashMap nugget = new HashMap(); + if (intfClass.getSimpleName().equals("TreeReducible")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); + nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); + } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); + nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); + } else { + continue; + } + parallelOptions.add(nugget); + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return parallelOptions; + } + return getParallelism(mySuperClass, parallelOptions); + } + + /** + * Utility function that looks up whether the annotation goes in INFO or FORMAT field. 
+ * + * @param myClass the class to query for the interfaces + * @return a String specifying the annotation field + */ + private final String getAnnotField(Class myClass) { + // + // Look up superclasses recursively until we find either + // GenotypeAnnotation or InfoFieldAnnotation + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass == InfoFieldAnnotation.class) { + return "INFO (variant-level)"; + } else if (mySuperClass == GenotypeAnnotation.class) { + return "FORMAT (sample genotype-level)"; + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getAnnotField(mySuperClass); + } + + /** + * Utility function that determines the annotation type for an instance of class c. + * + * @param myClass the class to query for the interfaces + * @param annotInfo an empty HashSet in which to collect the info + * @return a hash set of the annotation types, otherwise an empty set + */ + private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + if (intfClass.getName().contains("Annotation")) { + annotInfo.add(intfClass.getSimpleName()); + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return annotInfo; + } + return getAnnotInfo(mySuperClass, annotInfo); + } + + /** + * Utility function that determines the default downsampling settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param dsSettings an empty HashMap in which to collect the info + * @return a hash set of the downsampling settings, otherwise an empty set + */ + private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Downsample.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); + if(thisAnnotation instanceof Downsample) { + final Downsample dsAnnotation = (Downsample) thisAnnotation; + dsSettings.put("by", dsAnnotation.by().toString()); + dsSettings.put("to_cov", dsAnnotation.toCoverage()); + } + } + return dsSettings; + } + + /** + * Utility function that determines the reference window size for an instance of class c. + * + * @param myClass the class to query for the settings + * @param refWindow an empty HashMap in which to collect the info + * @return a HashMap of the window start and stop, otherwise an empty HashMap + */ + private HashMap getRefWindow(Class myClass, HashMap refWindow) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Reference.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); + if(thisAnnotation instanceof Reference) { + final Reference refAnnotation = (Reference) thisAnnotation; + refWindow.put("start", refAnnotation.window().start()); + refWindow.put("stop", refAnnotation.window().stop()); + } + } + return refWindow; + } + + /** + * Utility function that determines the ActiveRegion settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param activeRegion an empty HashMap in which to collect the info + * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap + */ + private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); + if(thisAnnotation instanceof ActiveRegionTraversalParameters) { + final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; + activeRegion.put("ext", arAnnotation.extension()); + activeRegion.put("max", arAnnotation.maxRegion()); + activeRegion.put("min", arAnnotation.minRegion()); + } + } + return activeRegion; + } + + /** + * Utility function that determines the partition type of an instance of class c. + * + * @param myClass the class to query for the annotation + * @return the partition type if applicable, otherwise an empty string + */ + private String getPartitionType(Class myClass) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(PartitionBy.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); + if(thisAnnotation instanceof PartitionBy) { + final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; + return partAnnotation.value().toString(); + } + } + return ""; + } + + /** + * Utility function that determines the type of walker subclassed by an instance of class c. 
+ * + * @param myClass the class to query for the annotation + * @return the type of walker if applicable, otherwise an empty string + */ + private String getWalkerType(Class myClass) { + // + // Look up superclasses recursively until we find either Walker or Object + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Walker")) { + return myClass.getSimpleName(); + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getWalkerType(mySuperClass); + } + + /** + * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. + * + * @param myClass the class to query for the annotation + * @param bucket a container in which we store the annotations collected + * @return a hash set of values, otherwise an empty set + */ + private HashSet> getReadFilters(Class myClass, HashSet> bucket) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ReadFilters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); + if(thisAnnotation instanceof ReadFilters) { + final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; + for (Class filter : rfAnnotation.value()) { + // make hashmap of simplename and url + final HashMap nugget = new HashMap(); + nugget.put("name", filter.getSimpleName()); + nugget.put("filename", GATKDocUtils.htmlFilenameForClass(filter)); + bucket.add(nugget); + } + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return bucket; + } + return getReadFilters(mySuperClass, bucket); + } + + + /** + * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in + * instance of class c. 
+ * + * @param instance the object to query for the field value + * @param fieldName the name of the field we are looking for in instance + * @return The value assigned to field in the ArgumentCollection, otherwise null + */ + private Object getFieldValue(Object instance, String fieldName) { + // + // subtle note. If you have a field named X that is an ArgumentCollection that + // contains a field X as well, you need only consider fields in the argumentCollection, not + // matching the argument itself. + // + // @ArgumentCollection + // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + // + + for (Field field : JVMUtils.getAllFields(instance.getClass())) { + if (field.isAnnotationPresent(ArgumentCollection.class)) { + //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); + Object fieldValue = JVMUtils.getFieldValue(field, instance); + Object value = getFieldValue(fieldValue, fieldName); + if (value != null) + return value; + } else if (field.getName().equals(fieldName)) { + return JVMUtils.getFieldValue(field, instance); + } + } + + return null; + } + + /** + * Pretty prints value + *

+ * Assumes value != null + * + * @param value + * @return + */ + private Object prettyPrintValueString(Object value) { + if (value.getClass().isArray()) { + Class type = value.getClass().getComponentType(); + if (boolean.class.isAssignableFrom(type)) + return Arrays.toString((boolean[]) value); + if (byte.class.isAssignableFrom(type)) + return Arrays.toString((byte[]) value); + if (char.class.isAssignableFrom(type)) + return Arrays.toString((char[]) value); + if (double.class.isAssignableFrom(type)) + return Arrays.toString((double[]) value); + if (float.class.isAssignableFrom(type)) + return Arrays.toString((float[]) value); + if (int.class.isAssignableFrom(type)) + return Arrays.toString((int[]) value); + if (long.class.isAssignableFrom(type)) + return Arrays.toString((long[]) value); + if (short.class.isAssignableFrom(type)) + return Arrays.toString((short[]) value); + if (Object.class.isAssignableFrom(type)) + return Arrays.toString((Object[]) value); + else + throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); + } else if (RodBinding.class.isAssignableFrom(value.getClass())) { + // annoying special case to handle the UnBound() constructor + return "none"; + } else if (value instanceof String) { + return value.equals("") ? "\"\"" : value; + } else { + return value.toString(); + } + } + + /** + * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
+ * + * @param c + * @return + */ + private Object makeInstanceIfPossible(Class c) { + Object instance = null; + try { + // don't try to make something where we will obviously fail + if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() && + !c.isArray() && !c.isPrimitive() & JVMUtils.isConcrete(c)) { + instance = c.newInstance(); + //System.out.printf("Created object of class %s => %s%n", c, instance); + return instance; + } else + return null; + } catch (IllegalAccessException e) { + } catch (InstantiationException e) { + } catch (ExceptionInInitializerError e) { + } catch (SecurityException e) { + } + // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions + // and rethrow then as RuntimeExceptions + catch (RuntimeException e) { + } + + return instance; + } + + + /** + * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet + * + * @return + */ + private ParsingEngine createStandardGATKParsingEngine() { + CommandLineProgram clp = new CommandLineGATK(); + try { + CommandLineProgram.start(clp, new String[]{}, true); + return clp.parser; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Gets the javadocs associated with field name in classDoc. Throws a + * runtime exception if this proves impossible. 
+ * + * @param classDoc + * @param name + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { + return getFieldDoc(classDoc, name, true); + } + + /** + * Recursive helper routine to getFieldDoc() + * + * @param classDoc + * @param name + * @param primary + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { + //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); + for (FieldDoc fieldDoc : classDoc.fields(false)) { + //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); + if (fieldDoc.name().equals(name)) + return fieldDoc; + + Field field = DocletUtils.getFieldForFieldDoc(fieldDoc); + if (field == null) + throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); + if (field.isAnnotationPresent(ArgumentCollection.class)) { + ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); + if (typeDoc == null) + throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); + else { + FieldDoc result = getFieldDoc(typeDoc, name, false); + if (result != null) + return result; + // else keep searching + } + } + } + + // if we didn't find it here, wander up to the superclass to find the field + if (classDoc.superclass() != null) { + return getFieldDoc(classDoc.superclass(), name, false); + } + + if (primary) + throw new RuntimeException("No field found for expected field " + name); + else + return null; + } + + /** + * Returns a Pair of (main, synonym) names for argument with fullName s1 and + * shortName s2. + * + * Previously we had it so the main name was selected to be the longest of the two, provided + * it didn't exceed MAX_DISPLAY_NAME, in which case the shorter was taken. 
But we now disable + * the length-based name rearrangement in order to maintain consistency in the GATKDocs table. + * + * This may cause messed up spacing in the CLI-help display but we don't care as much about that + * since more users use the online GATKDocs for looking up arguments. + * + * @param s1 the short argument name without -, or null if not provided + * @param s2 the long argument name without --, or null if not provided + * @return A pair of fully qualified names (with - or --) for the argument. The first + * element is the primary display name while the second (potentially null) is a + * synonymous name. + */ + Pair displayNames(String s1, String s2) { + s1 = s1 == null ? null : "-" + s1; + s2 = s2 == null ? null : "--" + s2; + + if (s1 == null) return new Pair(s2, null); + if (s2 == null) return new Pair(s1, null); + + return new Pair(s2, s1); + } + + /** + * Returns a human readable string that describes the Type type of a GATK argument. + *

+ * This will include parameterized types, so that Set{T} shows up as Set(T) and not + * just Set in the docs. + * + * @param type + * @return + */ + protected String argumentTypeString(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType) type; + List subs = new ArrayList(); + for (Type actualType : parameterizedType.getActualTypeArguments()) + subs.add(argumentTypeString(actualType)); + return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]"; + } else if (type instanceof GenericArrayType) { + return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]"; + } else if (type instanceof WildcardType) { + throw new RuntimeException("We don't support wildcards in arguments: " + type); + } else if (type instanceof Class) { + return ((Class) type).getSimpleName(); + } else { + throw new StingException("Unknown type: " + type); + } + } + + /** + * Helper routine that returns the Feature.class required by a RodBinding, + * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if + * the Type doesn't fit either model. + * + * @param type + * @return + */ + protected Class getFeatureTypeIfPossible(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType paramType = (ParameterizedType) type; + if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) { + return (Class) JVMUtils.getParameterizedTypeClass(type); + } else { + for (Type paramtype : paramType.getActualTypeArguments()) { + Class x = getFeatureTypeIfPossible(paramtype); + if (x != null) + return x; + } + } + } + + return null; + } + + /** + * High-level entry point for creating a FreeMarker map describing the GATK argument + * source with definition def, with associated javadoc fieldDoc. 
+ * + * @param fieldDoc + * @param source + * @param def + * @return a non-null Map binding argument keys with their values + */ + protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { + Map root = new HashMap(); + Pair names = displayNames(def.shortName, def.fullName); + + root.put("name", names.getFirst()); + + if (names.getSecond() != null) + root.put("synonyms", names.getSecond()); + + root.put("required", def.required ? "yes" : "no"); + + // type of the field + root.put("type", argumentTypeString(source.field.getGenericType())); + + Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); + if (featureClass != null) { + // deal with the allowable types + FeatureManager manager = new FeatureManager(); + List rodTypes = new ArrayList(); + for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) { + rodTypes.add(String.format("%s", + GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), + descriptor.getName())); + } + + root.put("rodTypes", Utils.join(", ", rodTypes)); + } + + // summary and fulltext + root.put("summary", def.doc != null ? def.doc : ""); + root.put("fulltext", fieldDoc.commentText()); + + // What are our enum options? + if (def.validOptions != null) + root.put("options", docForEnumArgument(source.field.getType())); + + // general attributes + List attributes = new ArrayList(); + if (def.required) attributes.add("required"); + if (source.isDeprecated()) attributes.add("deprecated"); + if (attributes.size() > 0) + root.put("attributes", Utils.join(", ", attributes)); + + return root; + } + + /** + * Helper routine that provides a FreeMarker map for an enumClass, grabbing the + * values of the enum and their associated javadoc documentation. 
+ * + * @param enumClass + * @return + */ + @Requires("enumClass.isEnum()") + private List> docForEnumArgument(final Class enumClass) { + final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); + if ( doc == null ) + throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead"); + + final Set enumConstantFieldNames = enumConstantsNames(enumClass); + + final List> bindings = new ArrayList>(); + for (final FieldDoc fieldDoc : doc.fields(false)) { + if (enumConstantFieldNames.contains(fieldDoc.name()) ) + bindings.add( + new HashMap() {{ + put("name", fieldDoc.name()); + put("summary", fieldDoc.commentText()); + }}); + } + + return bindings; + } + + /** + * Returns the name of the fields that are enum constants according to reflection + * + * @return a non-null set of fields that are enum constants + */ + private Set enumConstantsNames(final Class enumClass) { + final Set enumConstantFieldNames = new HashSet(); + + for ( final Field field : enumClass.getFields() ) { + if ( field.isEnumConstant() ) + enumConstantFieldNames.add(field.getName()); + } + + return enumConstantFieldNames; + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java new file mode 100644 index 000000000..783e7aa90 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpConstants.java @@ -0,0 +1,83 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject 
to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +public class HelpConstants { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + /** + * Arguments for parallelism options + */ + public final static String ARG_TREEREDUCIBLE = "-nt"; + public final static String ARG_NANOSCHEDULABLE = "-nct"; + public final static String CMDLINE_GATK_URL = GATK_DOCS_URL + "org_broadinstitute_sting_gatk_CommandLineGATK.html"; + + /** + * Definition of the group names / categories of tools. 
+ * The names get parsed to make supercategories in the doc index, + * so be careful when making big changes -- see GATKDoclet.java toMap() + */ + public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools"; + public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools"; + public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)"; + public final static String DOCS_CAT_RF = "Read Filters"; + public final static String DOCS_CAT_REFUTILS = "Reference Utilities"; + public final static String DOCS_CAT_RODCODECS = "ROD Codecs"; + public final static String DOCS_CAT_USRERR = "User Exceptions (DevZone)"; + public final static String DOCS_CAT_VALIDATION = "Validation Utilities"; + public final static String DOCS_CAT_ANNOT = "Variant Annotations"; + public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; + public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; + public final static String DOCS_CAT_TOY = "Toy Walkers (DevZone)"; + public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; + + public static String forumPost(String post) { + return GATK_FORUM_URL + post; + } + + /** + * Go-to developer name codes for tracking and display purposes. Only current team members should be in this list. + * When someone leaves, their charges should be redistributed. The actual string should be closest to the dev's + * abbreviated name or two/three-letter nickname as possible. The code can be something else if necessary to + * disambiguate from other variable. 
+ */ + public final static String MC = "MC"; // Mauricio Carneiro + public final static String EB = "EB"; // Eric Banks + public final static String RP = "RP"; // Ryan Poplin + public final static String GVDA = "GG"; // Geraldine Van der Auwera + public final static String VRR = "VRR"; // Valentin Ruano-Rubio + public final static String ALM = "ALM"; // Ami Levy-Moonshine + public final static String BH = "BH"; // Bertrand Haas + public final static String JoT = "JT"; // Joel Thibault + public final static String DR = "DR"; // David Roazen + public final static String KS = "KS"; // Khalid Shakir + + +} \ No newline at end of file diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java new file mode 100644 index 000000000..f2e3fad4b --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpFormatter.java @@ -0,0 +1,336 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.help; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; + +import java.net.InetAddress; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.*; +/** + * Print out help for Sting command-line applications. + */ + +public class HelpFormatter { + /** our log, which we want to capture anything from org.broadinstitute.sting */ + private static Logger logger = Logger.getLogger(HelpFormatter.class); + + public static final int FIELD_SEPARATION_WIDTH = 3; + + /** + * Prints the help, given a collection of argument definitions. + * @param applicationDetails Application details + * @param argumentDefinitions Argument definitions for which help should be printed. + */ + public void printHelp( ApplicationDetails applicationDetails, ArgumentDefinitions argumentDefinitions ) { + List argumentGroups = prepareArgumentGroups( argumentDefinitions ); + + List header = applicationDetails.applicationHeader; + String barrier = createBarrier(header); + + System.out.printf("%s%n",barrier); + for(String headerLine: header) + System.out.printf("%s%n",headerLine); + System.out.printf("%s%n",barrier); + for(String attributionLine: applicationDetails.attribution) + System.out.printf("%s%n",attributionLine); + System.out.printf("%s%n",barrier); + + String synopsis = getSynopsis(applicationDetails.runningInstructions,argumentGroups); + String additionalDetails = applicationDetails.additionalHelp != null ? 
applicationDetails.additionalHelp : ""; + String detailedDescription = getDetailed(argumentGroups); + + System.out.printf("%s%n%s%n%s%n",synopsis,detailedDescription,additionalDetails ); + } + + /** + * Gets the synopsis: the actual command to run. + * @param runningInstructions Instructions on how to run hte application. + * @param argumentGroups Program arguments sorted in order of definition group displays. + * @return A synopsis line. + */ + private String getSynopsis( String runningInstructions, + List argumentGroups ) { + // Build out the synopsis all as one long line. + StringBuilder lineBuilder = new StringBuilder(); + Formatter lineFormatter = new Formatter( lineBuilder ); + + lineFormatter.format("java %s", runningInstructions); + + for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) { + for( ArgumentDefinition argumentDefinition: argumentGroup.argumentDefinitions ) { + if(argumentDefinition.isHidden) + continue; + lineFormatter.format(" "); + if( !argumentDefinition.required ) lineFormatter.format("["); + if( argumentDefinition.shortName != null ) + lineFormatter.format("-%s", argumentDefinition.shortName); + else + lineFormatter.format("--%s", argumentDefinition.fullName); + if( !argumentDefinition.isFlag ) + lineFormatter.format(" <%s>", argumentDefinition.fullName); + if( !argumentDefinition.required ) lineFormatter.format("]"); + } + } + + // Word wrap the synopsis. + List wrappedSynopsis = TextFormattingUtils.wordWrap( lineBuilder.toString(), TextFormattingUtils.DEFAULT_LINE_WIDTH ); + + String header = "usage: "; + int headerLength = header.length(); + + StringBuilder synopsisBuilder = new StringBuilder(); + Formatter synopsisFormatter = new Formatter(synopsisBuilder); + for( String synopsisLine: wrappedSynopsis ) { + synopsisFormatter.format("%" + headerLength + "s%s%n", header, synopsisLine); + header = ""; + } + + return synopsisBuilder.toString(); + } + + /** + * Gets detailed output about each argument type. 
+ * @param argumentGroups Collection of program arguments sorted according to how they should be shown. + * @return Detailed text about all arguments. + */ + private String getDetailed( List argumentGroups ) { + StringBuilder builder = new StringBuilder(); + + for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) + builder.append( getDetailForGroup( argumentGroup ) ); + + return builder.toString(); + } + + /** + * Gets a detailed description for a given argument group. + * @param argumentDefinitionGroup The group of argument definitions to render. + * @return A string giving detailed info about the contents of this group. + */ + private String getDetailForGroup( ArgumentDefinitionGroup argumentDefinitionGroup ) { + if(argumentDefinitionGroup.allHidden()) + return ""; + + StringBuilder builder = new StringBuilder(); + Formatter formatter = new Formatter( builder ); + + if( argumentDefinitionGroup.groupName != null && argumentDefinitionGroup.argumentDefinitions.size() != 0 ) + builder.append( String.format("%nArguments for %s:%n", argumentDefinitionGroup.groupName ) ); + + List argumentDefinitions = new ArrayList(); + for(ArgumentDefinition argumentDefinition: argumentDefinitionGroup.argumentDefinitions) { + if(!argumentDefinition.isHidden) + argumentDefinitions.add(argumentDefinition); + } + + // Try to fit the entire argument definition across the screen, but impose an arbitrary cap of 3/4 * + // LINE_WIDTH in case the length of the arguments gets out of control. 
+ int argWidth = Math.min( findLongestArgumentCallingInfo(argumentDefinitions), (TextFormattingUtils.DEFAULT_LINE_WIDTH*3)/4 - FIELD_SEPARATION_WIDTH ); + int docWidth = TextFormattingUtils.DEFAULT_LINE_WIDTH - argWidth - FIELD_SEPARATION_WIDTH; + + for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { + Iterator wordWrappedArgs = TextFormattingUtils.wordWrap( getArgumentCallingInfo(argumentDefinition), argWidth ).iterator(); + Iterator wordWrappedDoc = TextFormattingUtils.wordWrap( getArgumentDoc(argumentDefinition), docWidth ).iterator(); + + while( wordWrappedArgs.hasNext() || wordWrappedDoc.hasNext() ) { + String arg = wordWrappedArgs.hasNext() ? wordWrappedArgs.next() : ""; + String doc = wordWrappedDoc.hasNext() ? wordWrappedDoc.next() : ""; + + String formatString = "%-" + argWidth + "s%" + FIELD_SEPARATION_WIDTH + "s%s%n"; + formatter.format( formatString, arg, "", doc ); + } + } + + return builder.toString(); + } + + /** + * Gets a string indicating how this argument should be passed to the application. + * @param argumentDefinition Argument definition for which help should be printed. + * @return Calling information for this argument. + */ + private String getArgumentCallingInfo( ArgumentDefinition argumentDefinition ) { + StringBuilder builder = new StringBuilder(); + Formatter formatter = new Formatter( builder ); + + formatter.format(" "); + if( argumentDefinition.shortName != null ) + formatter.format("-%s,", argumentDefinition.shortName); + formatter.format("--%s", argumentDefinition.fullName); + if( !argumentDefinition.isFlag ) + formatter.format(" <%s>", argumentDefinition.fullName); + + return builder.toString(); + } + + /** + * Gets a string of argument documentation. + * @param argumentDefinition Argument definition for which help should be printed. + * @return Brief description for this argument. 
+ */ + private String getArgumentDoc( ArgumentDefinition argumentDefinition ) { + StringBuilder builder = new StringBuilder(); + builder.append(argumentDefinition.doc); + if( argumentDefinition.validOptions != null ) { + builder.append(" ("); + builder.append(Utils.join("|",argumentDefinition.validOptions)); + builder.append(")"); + } + return builder.toString(); + } + + /** + * Crude implementation which finds the longest argument portion + * given a set of arguments. + * @param argumentDefinitions argument definitions to inspect. + * @return longest argument length. + */ + private int findLongestArgumentCallingInfo( Collection argumentDefinitions ) { + int longest = 0; + for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { + String argumentText = getArgumentCallingInfo( argumentDefinition ); + if( longest < argumentText.length() ) + longest = argumentText.length(); + } + return longest; + } + + /** + * Extract the argument definition groups from the argument definitions and arrange them appropriately. + * For help, we want the arguments sorted as they are declared in the class. However, required arguments + * should appear before optional arguments. + * @param argumentDefinitions Argument definitions from which to extract argument groups. + * @return A list of argument groups sorted in display order. + */ + private List prepareArgumentGroups( ArgumentDefinitions argumentDefinitions ) { + // Sort the list of argument definitions according to how they should be shown. + // Put the sorted results into a new cloned data structure. 
+ Comparator definitionComparator = new Comparator() { + public int compare( ArgumentDefinition lhs, ArgumentDefinition rhs ) { + if( lhs.required && rhs.required ) return 0; + if( lhs.required ) return -1; + if( rhs.required ) return 1; + return 0; + } + }; + + List argumentGroups = new ArrayList(); + for( ArgumentDefinitionGroup argumentGroup: argumentDefinitions.getArgumentDefinitionGroups() ) { + List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); + Collections.sort( sortedDefinitions, definitionComparator ); + argumentGroups.add( new ArgumentDefinitionGroup(argumentGroup.groupName,sortedDefinitions) ); + } + + // Sort the argument groups themselves with main arguments first, followed by plugins sorted in name order. + Comparator groupComparator = new Comparator() { + public int compare( ArgumentDefinitionGroup lhs, ArgumentDefinitionGroup rhs ) { + if( lhs.groupName == null && rhs.groupName == null ) return 0; + if( lhs.groupName == null ) return -1; + if( rhs.groupName == null ) return 1; + return lhs.groupName.compareTo(rhs.groupName); + } + }; + Collections.sort( argumentGroups, groupComparator ); + + + return argumentGroups; + } + + /** + * generateHeaderInformation + *

+ *

+ * Generate a standard header for the logger + * + * @param applicationDetails details of the application to run. + * @param parsedArgs the arguments passed in + */ + public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map parsedArgs) { + + DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); + java.util.Date date = new java.util.Date(); + + String barrier = createBarrier(applicationDetails.applicationHeader); + + logger.info(barrier); + for (String headerLine : applicationDetails.applicationHeader) + logger.info(headerLine); + logger.debug("Current directory: " + System.getProperty("user.dir")); + for (Map.Entry entry: parsedArgs.entrySet()) { + ArgumentMatchSource matchSource = entry.getKey(); + final String sourceName; + switch (matchSource.getType()) { + case CommandLine: sourceName = "Program"; break; + case Provider: sourceName = matchSource.getDescription(); break; + default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); + } + + String output = sourceName + " Args: " + entry.getValue().getDescription(); + logger.info(output); + } + logger.info(generateUserHelpData()); + logger.info("Date/Time: " + dateFormat.format(date)); + logger.info(barrier); + + for(String attribution: applicationDetails.attribution) + logger.info(attribution); + logger.info(barrier); + } + + /** + * Create the user-related help information. + * @return a non-null, non-empty String with the relevant information. 
+ */ + private static String generateUserHelpData() { + try { + return "Executing as " + + System.getProperty("user.name") + "@" + InetAddress.getLocalHost().getHostName() + + " on " + System.getProperty("os.name") + " " + System.getProperty("os.version") + + " " + System.getProperty("os.arch") + "; " + System.getProperty("java.vm.name") + + " " + System.getProperty("java.runtime.version") + "."; + } catch (Exception e) { + // don't fail + return ""; + } + } + + /** + * Create a barrier to use to distinguish the header from the rest of the output. + * @param text A collection of lines to output as part of a header. + * @return A barrier consisting of the '-' character. + */ + private static String createBarrier(List text) { + int barrierWidth = 0; + for(String headerLine: text) + barrierWidth = Math.max(headerLine.length(),barrierWidth); + return String.format("%0" + barrierWidth + "d",0).replace('0','-'); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/HelpUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/instrumentation/Sizeof.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/instrumentation/Sizeof.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/instrumentation/Sizeof.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalMergingRule.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalSetRule.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalSetRule.java diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/interval/IntervalUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/FileExtension.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/FileExtension.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java similarity 
index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/IOUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/IOUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/io/Resource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/Resource.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/io/Resource.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/io/Resource.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java similarity index 
100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java new file mode 100644 index 000000000..0ee08e560 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -0,0 +1,220 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the 
following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +import static java.lang.Math.log10; +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * + * User: rpoplin, carneiro + * Date: 3/1/12 + */ +public class Log10PairHMM extends N2MemoryPairHMM { + /** + * Should we use exact log10 calculation (true), or an approximation (false)? + */ + private final boolean doExactLog10; + + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected final static double log10_3 = log10(3.0); + + /** + * Create an uninitialized PairHMM + * + * @param doExactLog10 should the log10 calculations be exact (slow) or approximate (faster) + */ + public Log10PairHMM(final boolean doExactLog10) { + this.doExactLog10 = doExactLog10; + } + + /** + * Is this HMM using exact log10 calculations? 
+ * @return true if exact, false if approximate + */ + public boolean isDoingExactLog10Calculations() { + return doExactLog10; + } + + /** + * {@inheritDoc} + */ + @Override + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + for( int iii=0; iii < paddedMaxReadLength; iii++ ) { + Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); + } + } + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + + if ( ! constantsAreInitialized || recacheReadValues ) + initializeProbabilities(insertionGOP, deletionGOP, overallGCP); + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + initializeMatrixValues(haplotypeBases); + } + + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. 
+ return finalLikelihoodCalculation(); + } + + protected void initializeMatrixValues(final byte[] haplotypeBases) { + final double initialValue = Math.log10(1.0 / haplotypeBases.length); + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + + protected double finalLikelihoodCalculation() { + final int endI = paddedReadLength - 1; + double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); + for (int j = 2; j < paddedHaplotypeLength; j++) + finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); + return finalSumProbabilities; + } + + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + PairHMMModel.qualToTransProbsLog10(transition,insertionGOP,deletionGOP,overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + } + + + /** + * Compute the log10SumLog10 of the values + * + * NOTE NOTE NOTE + * + * Log10PairHMM depends critically on this function tolerating values that are all -Infinity + * and the sum returning -Infinity. Note good. Needs to be fixed. + * + * NOTE NOTE NOTE + * + * @param values an array of log10 probabilities that need to be summed + * @return the log10 of the sum of the probabilities + */ + @Requires("values != null") + protected double myLog10SumLog10(final double[] values) { + return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + + matchMatrix[indI][indJ] = prior + + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], + insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], + deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); + insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); + deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java new file mode 100644 index 000000000..057c67a55 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, 
modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Requires; + +/** + * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix + * + * User: rpoplin + * Date: 10/16/12 + */ +abstract class N2MemoryPairHMM extends PairHMM { + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + protected double[][] matchMatrix = null; + protected double[][] insertionMatrix = null; + protected double[][] deletionMatrix = null; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + + public void doNotUseTristateCorrection() { + doNotUseTristateCorrection = true; + } + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = PairHMMModel.createTransitionMatrix(maxReadLength); + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + } + + /** + * Print out the core hmm matrices for debugging + */ + protected void dumpMatrices() { + dumpMatrix("matchMetricArray", matchMatrix); + dumpMatrix("insertionMatrix", insertionMatrix); + dumpMatrix("deletionMatrix", deletionMatrix); + } + + /** + * Print out in a human readable form the matrix for debugging + * @param name the name of this matrix + * @param matrix the matrix of values + */ + @Requires({"name != null", "matrix != null"}) + private void dumpMatrix(final String name, final double[][] matrix) { + System.out.printf("%s%n", name); + for ( int i = 0; i < matrix.length; i++) { + System.out.printf("\t%s[%d]", name, i); + for ( int j = 0; j < matrix[i].length; j++ ) { + if ( Double.isInfinite(matrix[i][j]) ) + System.out.printf(" %15s", String.format("%f", matrix[i][j])); + else + System.out.printf(" % 15.5e", matrix[i][j]); + } + System.out.println(); + } + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..4203677a1 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,329 @@ +/* +* Copyright (c) 2012 The Broad 
Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
+ * + * User: rpoplin + * Date: 10/16/12 + */ +public abstract class PairHMM { + protected final static Logger logger = Logger.getLogger(PairHMM.class); + + protected boolean constantsAreInitialized = false; + + protected byte[] previousHaplotypeBases; + protected int hapStartIndex; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING, + /* Optimized AVX implementation of LOGLESS_CACHING called through JNI */ + VECTOR_LOGLESS_CACHING, + /* Debugging for vector implementation of LOGLESS_CACHING */ + DEBUG_VECTOR_LOGLESS_CACHING, + /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ + ARRAY_LOGLESS + } + + protected int maxHaplotypeLength, maxReadLength; + protected int paddedMaxReadLength, paddedMaxHaplotypeLength; + protected int paddedReadLength, paddedHaplotypeLength; + protected boolean initialized = false; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } + + //debug array + protected double[] mLikelihoodArray; + + //profiling information + protected static final boolean doProfiling = true; + protected long computeTime = 0; + protected long startTime = 0; + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); + if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); + + maxHaplotypeLength = haplotypeMaxLength; + maxReadLength = readMaxLength; + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + paddedMaxReadLength = readMaxLength + 1; + paddedMaxHaplotypeLength = haplotypeMaxLength + 1; + + previousHaplotypeBases = null; + + constantsAreInitialized = false; + initialized = true; + } + + /** + * Called at the end of PairHMM for a region - mostly used by the JNI implementations + */ + public void finalizeRegion() + { + ; + } + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * This function is used by the JNI implementations to transfer all data once to the native code + * @param haplotypes the list of haplotypes + * @param perSampleReadList map from sample name to list of reads + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { + initialize(readMaxLength, haplotypeMaxLength); + } + + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > 
listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base substitution, insertion, and deletion probabilities. + * + * @param reads the list of reads + * @param alleleHaplotypeMap the list of haplotypes + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + if(doProfiling) + startTime = System.nanoTime(); + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + mLikelihoodArray = new double[reads.size()*alleleHaplotypeMap.size()]; + int idx = 0; + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = 
read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + //for (final Allele allele : alleleHaplotypeMap.keySet()){ + for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()){ + //final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final Allele allele = currEntry.getKey(); + final Haplotype haplotype = currEntry.getValue(); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); + mLikelihoodArray[idx++] = log10l; + likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + mLikelihoodArray[idx++] = log10l; + } + } + if(doProfiling) + computeTime += (System.nanoTime() - startTime); + return likelihoodMap; + } + + /** + * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion + * probabilities. + * + * Note on using hapStartIndex. 
This allows you to compute the exact true likelihood of a full haplotypes + * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, + * starting only at the place where the new haplotype bases and the previous haplotype bases different. This + * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. + * Note that this assumes that the read and all associated quals values are the same. + * + * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length + * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length + * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases + * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases + * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases + * @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases + * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated + * parameters are the same, and only the haplotype bases are changing underneath us + * @return the log10 probability of read coming from the haplotype under the provided error model + */ + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { + + if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); + if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); + if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); + if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); + if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); + if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); + if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); + if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); + if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + + hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; + + // Pre-compute the difference between the current haplotype and the next one to be run + // Looking ahead is necessary for the ArrayLoglessPairHMM implementation + final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 
0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); + + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); + + if ( result > 0.0) + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f, PairHMM: %s", new String(haplotypeBases), new String(readBases), result, this.getClass().getSimpleName())); + else if (!MathUtils.goodLog10Probability(result)) + throw new IllegalStateException("Invalid Log Probability: " + result); + + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). + // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + previousHaplotypeBases = haplotypeBases; + + // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype + // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart + hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; + + return result; + } + + /** + * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 + */ + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) + protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex); + + /** + * Compute the first position at which two haplotypes differ + * + * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. + * + * @param haplotype1 the first haplotype1 + * @param haplotype2 the second haplotype1 + * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same + */ + public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { + if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); + if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); + + for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { + if( haplotype1[iii] != haplotype2[iii] ) { + return iii; + } + } + + return Math.min(haplotype1.length, haplotype2.length); + } + + /** + * Return the results of the computeLikelihoods function + */ + public double[] getLikelihoodArray() { return mLikelihoodArray; } + /** + * Called at the end of the program to close files, print profiling information etc + */ + public void close() + { + 
if(doProfiling) + System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9)); + } +} diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java new file mode 100644 index 000000000..551be676a --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMModel.java @@ -0,0 +1,435 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Helper class that implement calculations required to implement the PairHMM Finite State Automation (FSA) model. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMModel { + + + /** + * Prevents instantiation of this class + */ + private PairHMMModel() { + + } + + /** + * Length of the standard transition probability array. + */ + public static final int TRANS_PROB_ARRAY_LENGTH = 6; + + /** + * Position in the transition probability array for the Match-to-Match transition. + */ + public static final int matchToMatch = 0; + + /** + * Position in the transition probability array for the Indel-to-Match transition. + */ + public static final int indelToMatch = 1; + + /** + * Position in the transition probability array for the Match-to-Insertion transition. + */ + public static final int matchToInsertion = 2; + + /** + * Position in the transition probability array for the Insertion-to-Insertion transition. + */ + public static final int insertionToInsertion = 3; + + /** + * Position in the transition probability array for the Match-to-Deletion transition. + */ + public static final int matchToDeletion = 4; + + /** + * Position in the transition probability array for the Deletion-to-Deletion transition. + */ + public static final int deletionToDeletion = 5; + + /** + * Convenient ln10 constant. + */ + private static double LN10 = Math.log(10); + + /** + * Convenient (ln10)^-1 constant. + */ + private static double INV_LN10 = 1.0 / LN10; + + /** + * Holds pre-calculated the matchToMath probability values in linear scale. + * + *

+ * This is a triangular matrix stored in a unidimentional array like so: + *

+ * (0,0), (0,1), (1,1), (0,2), (1,2), (2,2), (0,3) ... ({@link QualityUtils#MAX_QUAL},{@link QualityUtils#MAX_QUAL}) + */ + private static double[] matchToMatchProb = new double[((QualityUtils.MAX_QUAL + 1) * (QualityUtils.MAX_QUAL + 2)) >> 1]; + + /** + * Holds pre-calculated the matchToMath probability values in log10 scale. + * + *

+ * This is a triangular matrix stored in a unidimentional array like so: + *

+ * (0,0), (0,1), (1,1), (0,2), (1,2), (2,2), (0,3) ... ({@link QualityUtils#MAX_QUAL},{@link QualityUtils#MAX_QUAL}) + */ + private static double[] matchToMatchLog10 = new double[((QualityUtils.MAX_QUAL + 1) * (QualityUtils.MAX_QUAL + 2)) >> 1]; + + /** + * Initialize matchToMatch cache tables {@link #matchToMatch} and {@link #matchToMatchLog10} + */ + static { + for (int i = 0, offset = 0; i <= QualityUtils.MAX_QUAL; offset += ++i) + for (int j = 0; j <= i; j++) { + final double log10Sum = MathUtils.approximateLog10SumLog10(-0.1 * i,-0.1 * j); + matchToMatchLog10[offset + j] = + Math.log1p( - Math.min(1,Math.pow(10,log10Sum))) * INV_LN10; + matchToMatchProb[offset + j] = Math.pow(10,matchToMatchLog10[offset + j]); + } + } + + /** + * Fills a transition probability array given the different quality scores affecting a read site + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. 
+ */ + public static void qualToTransProbs(final double[] dest, final byte insQual, final byte delQual, final byte gcp) { + if (insQual < 0) throw new IllegalArgumentException("insert quality cannot less than 0: " + insQual); + if (delQual < 0) throw new IllegalArgumentException("deletion quality cannot be less than 0: " + delQual); + if (gcp < 0) throw new IllegalArgumentException("gcp cannot be less than 0: " + gcp); + dest[matchToMatch] = matchToMatchProb(insQual, delQual); + dest[matchToInsertion] = QualityUtils.qualToErrorProb(insQual); + dest[matchToDeletion] = QualityUtils.qualToErrorProb(delQual); + dest[indelToMatch] = QualityUtils.qualToProb(gcp); + dest[insertionToInsertion] = dest[deletionToDeletion] = QualityUtils.qualToErrorProb(gcp); + } + + /** + * Returns a transition probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. + * + * @return never {@code null}. An array of length {@link #TRANS_PROB_ARRAY_LENGTH}. + */ + @SuppressWarnings("unused") + public static double[] qualToTransProbs(final byte insQual, final byte delQual, final byte gcp) { + final double[] dest = new double[TRANS_PROB_ARRAY_LENGTH]; + qualToTransProbs(dest,insQual,delQual,gcp); + return dest; + } + + /** + * Fills ax matrix with the transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix correspond to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param dest the matrix to update + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays, matrices is {@code null} or any entry in {@code dest} is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * @throws ArrayIndexOutOfBoundsException if {@code dest} or any of its elements is not large enough to contain the + * transition matrix. + */ + @SuppressWarnings("unused") + public static void qualToTransProbs(final double[][] dest, final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final int readLength = insQuals.length; + if (delQuals.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + delQuals.length); + if (gcps.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + gcps.length); + + if (dest.length < readLength + 1) throw new IllegalArgumentException("destination length is not enough for the read length: " + dest.length + " < " + readLength + " + 1"); + + for (int i = 0; i < readLength; i++) + qualToTransProbs(dest[i + 1], insQuals[i], delQuals[i], gcps[i]); + } + + /** + * Returns a matrix with the transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix correspond to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * + * @return never {@code null}, an matrix of the dimensions explained above. + */ + @SuppressWarnings("unused") + public static double[][] qualToTransProbs(final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final double[][] dest = createTransitionMatrix(insQuals.length); + qualToTransProbs(dest,insQuals,delQuals,gcps); + return dest; + } + + /** + * Fills a transition log10 probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. 
+ */ + public static void qualToTransProbsLog10(final double[] dest, final byte insQual, final byte delQual, final byte gcp) { + if (insQual < 0) throw new IllegalArgumentException("insert quality cannot less than 0: " + insQual); + if (delQual < 0) throw new IllegalArgumentException("deletion quality cannot be less than 0: " + delQual); + if (gcp < 0) throw new IllegalArgumentException("gcp cannot be less than 0: " + gcp); + dest[matchToMatch] = matchToMatchProbLog10(insQual, delQual); + dest[matchToInsertion] = QualityUtils.qualToErrorProbLog10(insQual); + dest[matchToDeletion] = QualityUtils.qualToErrorProbLog10(delQual); + dest[indelToMatch] = QualityUtils.qualToProbLog10(gcp); + dest[insertionToInsertion] = dest[deletionToDeletion] = QualityUtils.qualToErrorProbLog10(gcp); + } + + /** + * Returns a transition log10 probability array given the different quality scores affecting a read site. + * + * @param insQual the insertion quality score as a byte. + * @param delQual the deletion quality score as a byte. + * @param gcp the gap-continuation-penalty score as a byte. + * + * @throws NullPointerException if {@code dest} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if {@code dest} is not large enough. + * @throws IllegalArgumentException if {@code insQual}, {@code delQual} or {@code gcp} is less than negative. + * + * @return never {@code null}. An array of length {@link #TRANS_PROB_ARRAY_LENGTH}. + */ + @SuppressWarnings("unused") + public static double[] qualToTransProbsLog10(final byte insQual, final byte delQual, final byte gcp) { + final double[] dest = new double[TRANS_PROB_ARRAY_LENGTH]; + qualToTransProbsLog10(dest,insQual,delQual,gcp); + return dest; + } + + /** + * Fills a matrix with the log10 transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix correspond to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays, matrices is {@code null} or any entry in {@code dest} is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * @throws ArrayIndexOutOfBoundsException if {@code dest} or any of its elements is not large enough to contain the + * transition matrix. + */ + @SuppressWarnings("unused") + public static void qualToTransProbsLog10(final double[][] dest, final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final int readLength = insQuals.length; + if (delQuals.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + delQuals.length); + if (gcps.length != readLength) throw new IllegalArgumentException("deletion quality array length does not match insert quality array length: " + readLength + " != " + gcps.length); + if (dest.length < readLength + 1) throw new IllegalArgumentException("destination length is not enough for the read length: " + dest.length + " < " + readLength + " + 1"); + + for (int i = 0; i < readLength; i++) + qualToTransProbsLog10(dest[i+1],insQuals[i],delQuals[i],gcps[i]); + } + + /** + * Returns a matrix with the log10 transition probabilities for a number of bases. + * + *

+ * The first dimension of the matrix correspond to the different bases where the first one is stored in position 1. + * Thus the position 0 is left empty and the length of the resulting matrix is actually {@code insQual.length + 1}. + *

+ * Each entry is the transition probability array for that base with a length of {@link #TRANS_PROB_ARRAY_LENGTH}. + * + * @param insQuals insertion qualities. + * @param delQuals deletion qualities. + * @param gcps gap-continuation penalty qualities. + * + * @throws NullPointerException if any of the input arrays is {@code null}. + * @throws IllegalArgumentException if {@code IllegalArgumentException} + * if the input array don't have the same length. + * + * @return never {@code null}, an matrix of the dimensions explained above. + */ + @SuppressWarnings("unused") + public static double[][] qualToTransProbsLog10(final byte[] insQuals, final byte[] delQuals, final byte[] gcps) { + final double[][] dest = createTransitionMatrix(insQuals.length); + qualToTransProbsLog10(dest,insQuals,delQuals,gcps); + return dest; + } + + /** + * Creates a transition probability matrix large enough to work with sequences of a particular length. + * + * @param maxReadLength the maximum read length for the transition matrix. + * + * @return never {@code null}. A matrix of {@code maxReadLength + 1} by {@link #TRANS_PROB_ARRAY_LENGTH} positions. + */ + public static double[][] createTransitionMatrix(final int maxReadLength) { + return new double[maxReadLength + 1][TRANS_PROB_ARRAY_LENGTH]; + } + + /** + * Returns the probability that neither of two event takes place. + *

+ * + * We assume that both event never occur together and that delQual is the conditional probability + * (qual. encoded) of the second event, given the first event didn't took place. So that the + * probability of no event is:
+ * + * We assume that both event never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of the first event. + * @param delQual PhRED scaled quality/probability of the second event. + * + * @return a value between 0 and 1. + */ + public static double matchToMatchProb(final byte insQual, final byte delQual) { + return matchToMatchProb((insQual & 0xFF), (delQual & 0xFF)); + } + + /** + * Returns the probability (log 10 scaled) that neither of two event, insertion and deletion, takes place. + *

+ * + * We assume that both event never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * + * @return a value between 0 and -Inf. + */ + public static double matchToMatchProbLog10(final byte insQual, final byte delQual) { + return matchToMatchProbLog10((insQual & 0xFF), (delQual & 0xFF)); + } + + /** + * Returns the probability that neither of two events, insertion and deletion, takes place. + *

+ * + * We assume that both event never occur together and that delQual is the conditional probability + * (qual. encoded) of the second event, given the first event didn't took place. So that the + * probability of no event is:
+ * + * We assume that both event never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * @return a value between 0 and 1. + */ + public static double matchToMatchProb(final int insQual, final int delQual) { + final int minQual; + final int maxQual; + if (insQual <= delQual) { + minQual = insQual; + maxQual = delQual; + } else { + minQual = delQual; + maxQual = insQual; + } + + if (minQual < 0) throw new IllegalArgumentException("quality cannot be negative: " + minQual + " and " + maxQual); + + return (QualityUtils.MAX_QUAL < maxQual) ? 1.0 - Math.pow(10, MathUtils.approximateLog10SumLog10(-0.1 * minQual, -0.1 * maxQual)) : + matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; + } + + /** + * Returns the probability (log 10 scaled) that neither of two event takes place. + *

+ * + * We assume that both event never occur together and that delQual is the conditional probability (qual. encoded) + * of the second event, given the first event didn't took place. So that the probability of no event is:
+ * + * We assume that both event never occur together so that the probability of no event is:
+ * + * 1 - ProbErr(insQual) - ProbErr(delQual)
+ * + * @param insQual PhRED scaled quality/probability of an insertion. + * @param delQual PhRED scaled quality/probability of a deletion. + * + * @return a value between 0 and -Inf. + */ + public static double matchToMatchProbLog10(final int insQual, final int delQual) { + final int minQual; + final int maxQual; + if (insQual <= delQual) { + minQual = insQual; + maxQual = delQual; + } else { + minQual = delQual; + maxQual = insQual; + } + return (QualityUtils.MAX_QUAL < maxQual) ? Math.log1p ( + - Math.min(1,Math.pow(10, + MathUtils.approximateLog10SumLog10(-.1 * minQual, -.1 * maxQual)))) * INV_LN10 : + matchToMatchLog10[((maxQual * (maxQual + 1)) >> 1) + minQual]; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementFilter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup2/Notes b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup2/Notes similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/pileup2/Notes rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup2/Notes diff --git 
a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRArgumentSet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRMode.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/BQSRMode.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/EventType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/recalibration/EventType.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/CapturedStreamOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessController.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessController.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessOutput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessSettings.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/ProcessSettings.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamLocation.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamLocation.java diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamOutput.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/runtime/StreamOutput.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/AlignmentUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java diff --git 
a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialStingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/CigarUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/CigarUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java 
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SAMFileReaderBuilder.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/SimplifyingSAMFileWriter.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/sam/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/Parameters.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/Parameters.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java rename to 
public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/ListFileUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/ListFileUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/TextFormattingUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/TextFormattingUtils.java diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/XReadLines.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/text/XReadLines.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java similarity index 100% rename from 
public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/package-info.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/threading/package-info.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/threading/package-info.java diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java new file mode 100644 index 000000000..fb5564ab3 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -0,0 +1,2206 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.popgen.HardyWeinbergCalculation; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.io.Serializable; +import java.util.*; + +public class GATKVariantContextUtils { + + private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); + + public static final int DEFAULT_PLOIDY = 2; + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + + public final static String MERGE_FILTER_PREFIX = "filterIn"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_INTERSECTION = "Intersection"; + + public enum GenotypeMergeType { + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. 
+ */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE + } + + public enum FilteredRecordMergeType { + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL + } + + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. + */ + MIX_TYPES + } + + /** + * Refactored out of the AverageAltAlleleLength annotation class + * @param vc the variant context + * @return the average length of the alt allele (a double) + */ + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( final Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } + + /** + * create a genome location, given a variant context + * @param genomeLocParser parser + * @param vc the variant context + * @return the genomeLoc + */ + public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { + return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); + } + + public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { + if (!context.isSNP() || !context.isBiallelic()) + throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); + return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); + } + + /** + * If this is a BiAllelic SNP, is it a transition? + */ + public static boolean isTransition(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + /** + * If this is a BiAllelic SNP, is it a transversion? 
+ */ + public static boolean isTransversion(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + /** + * Returns a context identical to this with the REF and ALT alleles reverse complemented. + * + * @param vc variant context + * @return new vc + */ + public static VariantContext reverseComplement(VariantContext vc) { + // create a mapping from original allele to reverse complemented allele + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { + Allele newAllele; + if ( originalAllele.isNoCall() ) + newAllele = originalAllele; + else + newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); + alleleMap.put(originalAllele, newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { + Allele newAllele = alleleMap.get(allele); + if ( newAllele == null ) + newAllele = Allele.NO_CALL; + newAlleles.add(newAllele); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } + + /** + * Returns true iff VC is an non-complex indel where every allele represents an expansion or + * contraction of a series of identical bases in the reference. 
+ * + * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT + * + * If VC = -/CT, then this function returns true because the CT insertion matches exactly the + * upcoming reference. + * If VC = -/CTA then this function returns false because the CTA isn't a perfect match + * + * Now consider deletions: + * + * If VC = CT/- then again the same logic applies and this returns true + * The case of CTA/- makes no sense because it doesn't actually match the reference bases. + * + * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For + * each insertion allele of n bases, check if that allele matches the next n reference bases. + * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, + * as it must necessarily match the first n bases. If this test returns true for all + * alleles you are a tandem repeat, otherwise you are not. + * + * @param vc + * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return false; + + final Allele ref = vc.getReference(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) + return false; + } + + // we've passed all of the tests, so we are a repeat + return true; + } + + /** + * + * @param vc + * @param refBasesStartingAtVCWithPad + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final boolean VERBOSE = false; + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return null; + + final Allele refAllele = vc.getReference(); + final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); + + byte[] repeatUnit = null; + final ArrayList lengths = new ArrayList<>(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); + + final int[] repetitionCount = result.first; + // repetition count = 0 means allele is not a tandem expansion of context + if (repetitionCount[0] == 0 || repetitionCount[1] == 0) + return null; + + if (lengths.size() == 0) { + lengths.add(repetitionCount[0]); // add ref allele length only once + } + lengths.add(repetitionCount[1]); // add this alt allele's length + + repeatUnit = result.second; + if (VERBOSE) { + System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); + System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); + System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); + System.out.println("RU:"+new String(repeatUnit)); + } + } + + return new Pair, byte[]>(lengths,repeatUnit); + } + + public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { + /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. + Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. + */ + + byte[] longB; + // find first repeat unit based on either ref or alt, whichever is longer + if (altBases.length > refBases.length) + longB = altBases; + else + longB = refBases; + + // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units + // for example, -*,CACA needs to first be decomposed into (CA)2 + final int repeatUnitLength = findRepeatedSubstring(longB); + final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); + + final int[] repetitionCount = new int[2]; + // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) + int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); + repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; + repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; + + return new Pair<>(repetitionCount, repeatUnit); + + } + + /** + * Find out if a string can be represented as a tandem number of substrings. + * For example ACTACT is a 2-tandem of ACT, + * but ACTACA is not. 
+ * + * @param bases String to be tested + * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't + * be represented as one, it will be just the length of the input string) + */ + public static int findRepeatedSubstring(byte[] bases) { + + int repLength; + for (repLength=1; repLength <=bases.length; repLength++) { + final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); + boolean allBasesMatch = true; + for (int start = repLength; start < bases.length; start += repLength ) { + // check that remaining of string is exactly equal to repeat unit + final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); + if (!Arrays.equals(candidateRepeatUnit, basePiece)) { + allBasesMatch = false; + break; + } + } + if (allBasesMatch) + return repLength; + } + + return repLength; + } + + /** + * Helper routine that finds number of repetitions a string consists of. + * For example, for string ATAT and repeat unit AT, number of repetitions = 2 + * @param repeatUnit Substring + * @param testString String to test + * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) + * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's + */ + public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + int numRepeats = 0; + if (lookForward) { + // look forward on the test string + for (int start = 0; start < testString.length; start += repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 + // look forward on the test string + for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + /** + * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference + * @param ref + * @param alt + * @param refBasesStartingAtVCWithoutPad + * @return + */ + protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { + if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) + return false; // we require one allele be a prefix of another + + if ( ref.length() > alt.length() ) { // we are a deletion + return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); + } else { // we are an insertion + return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); + } + } + + protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { + final String potentialRepeat = l.substring(s.length()); // skip s bases + + for ( int i = 0; i < minNumberOfMatches; i++) { + final int start = i * potentialRepeat.length(); + final int end = (i+1) * potentialRepeat.length(); + if ( ref.length() < end ) + return false; // we ran out of bases to test + final String refSub = ref.substring(start, end); + if ( ! 
refSub.equals(potentialRepeat) ) + return false; // repeat didn't match, fail + } + + return true; // we passed all tests, we matched + } + + public enum GenotypeAssignmentMethod { + /** + * set all of the genotype GT values to NO_CALL + */ + SET_TO_NO_CALL, + + /** + * Use the subsetted PLs to greedily assigned genotypes + */ + USE_PLS_TO_ASSIGN, + + /** + * Try to match the original GT calls, if at all possible + * + * Suppose I have 3 alleles: A/B/C and the following samples: + * + * original_GT best_match to A/B best_match to A/C + * S1 => A/A A/A A/A + * S2 => A/B A/B A/A + * S3 => B/B B/B A/A + * S4 => B/C A/B A/C + * S5 => C/C A/A C/C + * + * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes + * when split into 2 bi-allelic variants will be het in each, which is good in some cases, + * rather than the undetermined behavior when using the PLs to assign, which could result + * in hom-var or hom-ref for each, depending on the exact PL values. + */ + BEST_MATCH_TO_ORIGINAL, + + /** + * do not even bother changing the GTs + */ + DO_NOT_ASSIGN_GENOTYPES + } + + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, + final List allelesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); + if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); + + // optimization: if no input genotypes, just exit + if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); + } + + /** + * Figure out which likelihood indexes to use for a selected down set of alleles + * + * @param originalVC the original VariantContext + * @param allelesToUse the subset of alleles to use + * @return a list of PL indexes to use or null if none + */ + private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) + return null; + + return getLikelihoodIndexes(originalVC, alleleIndexesToUse); + } + + /** + * Get the actual likelihoods indexes to use given the corresponding allele indexes + * + * @param originalVC the original VariantContext + * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) + * @return a non-null List + */ + private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + + final List result = new ArrayList<>(30); + + // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); + + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) + result.add(PLindex); + } + + return result; + } + + /** + * Given an original VariantContext and a list of alleles from that VC to keep, + * returns a bitset representing which allele indexes should be kept + * + * @param originalVC the original VC + * @param allelesToKeep the list of alleles to keep + * @return non-null bitset + */ + private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { + final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; + final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; + + // the reference Allele is definitely still used + alleleIndexesToKeep[0] = true; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) + alleleIndexesToKeep[i+1] = true; + } + + return alleleIndexesToKeep; + } + + /** + * Create the new GenotypesContext with the subsetted PLs + * + * @param originalGs the original GenotypesContext + * @param vc the original VariantContext + * @param allelesToUse the actual alleles to use with the new Genotypes + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, + final VariantContext vc, + final List 
allelesToUse, + final List likelihoodIndexesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // make sure we are seeing the expected number of likelihoods per sample + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + final GenotypeBuilder gb = new GenotypeBuilder(g); + + // create the new likelihoods array from the alleles we are allowed to use + double[] newLikelihoods; + if ( !g.hasLikelihoods() ) { + // we don't have any likelihoods, so we null out PLs and make G ./. + newLikelihoods = null; + gb.noPL(); + } else { + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( final int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) + gb.noPL(); + else + gb.PL(newLikelihoods); + } + + updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); + newGTs.add(gb.make()); + } + + return newGTs; + } + + private static boolean 
likelihoodsAreUninformative(final double[] likelihoods) { + return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; + } + + /** + * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod + * + * @param originalGT the original genotype calls, cannot be null + * @param gb the builder where we should put our newly called alleles, cannot be null + * @param assignmentMethod the method to use to do the assignment, cannot be null + * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null + * @param allelesToUse the alleles we are using for our subsetting + */ + public static void updateGenotypeAfterSubsetting(final List originalGT, + final GenotypeBuilder gb, + final GenotypeAssignmentMethod assignmentMethod, + final double[] newLikelihoods, + final List allelesToUse) { + switch ( assignmentMethod ) { + case DO_NOT_ASSIGN_GENOTYPES: + break; + case SET_TO_NO_CALL: + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + break; + case USE_PLS_TO_ASSIGN: + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { + // if there is no mass on the (new) likelihoods, then just no-call the sample + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + } else { + // find the genotype with maximum likelihoods + final int PLindex = MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + break; + case BEST_MATCH_TO_ORIGINAL: + final List best = new LinkedList<>(); + final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument + for ( final Allele originalAllele : originalGT ) { + best.add(allelesToUse.contains(originalAllele) ? 
originalAllele : ref); + } + gb.noGQ(); + gb.noPL(); + gb.alleles(best); + break; + } + } + + /** + * Subset the samples in VC to reference only information with ref call alleles + * + * Preserves DP if present + * + * @param vc the variant context to subset down to + * @param ploidy ploidy to use if a genotype doesn't have any alleles + * @return a GenotypesContext + */ + public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) return oldGTs; + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); + + final Allele ref = vc.getReference(); + final List diploidRefAlleles = Arrays.asList(ref, ref); + + // create the new genotypes + for ( final Genotype g : vc.getGenotypes() ) { + final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); + final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); + final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); + if ( g.hasDP() ) gb.DP(g.getDP()); + if ( g.hasGQ() ) gb.GQ(g.getGQ()); + newGTs.add(gb.make()); + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { + return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. 
+ * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList<>(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); + VariantContextUtils.calculateChromosomeCounts(builder, true); + final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); + biallelics.add(trimmed); + } + + return biallelics; + } + } + + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; + } + + //TODO consider refactor variant-context merging code so that we share as much as possible between + //TODO simpleMerge and referenceConfidenceMerge + //TODO likely using a separate helper class or hierarchy. + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
+ * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name. + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
+ * + * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + return null; + + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); + + final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + // Make sure all variant contexts are padded with reference base in case of indels if necessary + List VCs = new ArrayList<>(); + + for 
(final VariantContext vc : preFilteredVCs) { + if ( ! filteredAreUncalled || vc.isNotFiltered() ) + VCs.add(vc); + } + + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + return null; + + // establish the baseline info from the first VC + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); + + final Set alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + int maxAC = -1; + final Map attributesWithMaxAC = new LinkedHashMap<>(); + double log10PError = CommonInfo.NO_LOG10_PERROR; + boolean anyVCHadFiltersApplied = false; + VariantContext vcWithMaxAC = null; + GenotypesContext genotypes = GenotypesContext.create(); + + // counting the number of filtered and variant VCs + int nFiltered = 0; + + boolean remapped = false; + + // cycle through and add info from the other VCs, making sure the loc/reference matches + for ( final VariantContext vc : VCs ) { + if ( longestVC.getStart() != vc.getStart() ) + throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; // get the longest location + + nFiltered += vc.isFiltered() ? 
1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); + + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); + + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + + // We always take the QUAL of the first VC with a non-MISSING qual for the combined value + if ( log10PError == CommonInfo.NO_LOG10_PERROR ) + log10PError = vc.getLog10PError(); + + filters.addAll(vc.getFilters()); + anyVCHadFiltersApplied |= vc.filtersWereApplied(); + + // + // add attributes + // + // special case DP (add it up) and ID (just preserve it) + // + if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + if ( vc.hasID() ) rsIDs.add(vc.getID()); + if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); + // lets see if the string contains a "," separator + if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { + final int ac = Integer.valueOf(alleleCount.trim()); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } else { + final int ac = Integer.valueOf(rawAlleleCounts); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } + + for (final Map.Entry p : vc.getAttributes().entrySet()) { + final String key = p.getKey(); + final Object value = p.getValue(); + // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! 
inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } + } + } + } + + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD + for ( final VariantContext vc : VCs ) { + if (vc.getAlleles().size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { + if ( ! genotypes.isEmpty() ) { + logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", + vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); + } + genotypes = stripPLsAndAD(genotypes); + // this will remove stale AC,AF attributed from vc + VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); + break; + } + } + + // take the VC with the maxAC and pull the attributes into a modifiable map + if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } + + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) + filters.clear(); + + + if ( annotateOrigin ) { // we care about where the call came from + String setValue; + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered + setValue = MERGE_INTERSECTION; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = MERGE_FILTER_IN_ALL; + else if ( variantSources.isEmpty() ) // everyone was reference + setValue = MERGE_REF_IN_ALL; + else { + final LinkedHashSet s = new LinkedHashSet<>(); + for ( final VariantContext vc : VCs ) + if ( vc.isVariant() ) + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); + setValue = Utils.join("-", s); + } + + if ( setKey != null ) { + attributes.put(setKey, setValue); + if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.put(setKey, setValue); + } + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + if ( anyVCHadFiltersApplied ) { + builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); + } + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + + // Trim the padded bases of all alleles if necessary + final VariantContext merged = builder.make(); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; + } + + private static Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case + * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC + * @return new VariantContext representing the merge of all VCs or null if it not relevant + */ + public static VariantContext referenceConfidenceMerge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { + // this can happen if e.g. 
you are using a dbSNP file that spans a region with no gVCFs + if ( VCs == null || VCs.size() == 0 ) + return null; + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // FinalAlleleSet contains the alleles of the new resulting VC. + // Using linked set in order to guaranteed an stable order: + final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); + // Reference goes first: + finalAlleleSet.add(refAllele); + + final Map attributes = new LinkedHashMap<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + final GenotypesContext genotypes = GenotypesContext.create(); + + final int variantContextCount = VCs.size(); + // In this list we hold the mapping of each variant context alleles. + final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); + // cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + + vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) + : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); + } + + // Add to the end if at all required in in the output. 
+ if (!removeNonRefSymbolicAllele) finalAlleleSet.add(NON_REF_SYMBOLIC_ALLELE); + + final List allelesList = new ArrayList<>(finalAlleleSet); + + for ( final Pair> pair : vcAndNewAllelePairs ) { + final VariantContext vc = pair.getFirst(); + final List remappedAlleles = pair.getSecond(); + + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + else if ( vc.getNSamples() == 1 && vc.getGenotype(0).hasExtendedAttribute("MIN_DP") ) // handle the gVCF case from the HaplotypeCaller + depth += vc.getGenotype(0).getAttributeAsInt("MIN_DP", 0); + + if ( loc.getStart() != vc.getStart() ) + continue; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); + } + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(attributes); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later + + return builder.make(); + } + + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? 
null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Remove the stale attributes from the merged set + * + * @param attributes the attribute map + */ + private static void removeStaleAttributesAfterMerge(final Map attributes) { + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.END_KEY); + } + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + // nothing to do + } + } + } + + private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. 
All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. + /** + * This method does a couple of things: + *

<ul><li>
+ * remaps the vc alleles considering the differences between the final reference allele and its own reference,</li>
+ * <li>
+ * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.
+ * </li></ul>
+ * + * @param vcAlleles the variant context allele list. + * @param refAllele final reference allele. + * @param finalAlleles where to add the final set of non-ref called alleles. + * @return never {@code null} + */ + private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { + final Allele vcRef = vcAlleles.get(0); + if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); + final byte[] refBases = refAllele.getBases(); + final int extraBaseCount = refBases.length - vcRef.getBases().length; + if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); + final List result = new ArrayList<>(vcAlleles.size()); + + for (final Allele a : vcAlleles) { + if (a.isReference()) { + result.add(refAllele); + } else if (a.isSymbolic()) { + result.add(a); + // we always skip when adding to finalAlleles this is done outside if applies. + if (!a.equals(NON_REF_SYMBOLIC_ALLELE)) + finalAlleles.add(a); + } else if (a.isCalled()) { + final Allele newAllele; + if (extraBaseCount > 0) { + final byte[] oldBases = a.getBases(); + final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); + System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); + newAllele = Allele.create(newBases,false); + } else + newAllele = a; + result.add(newAllele); + finalAlleles.add(newAllele); + } else { // NO_CALL and strange miscellanea + result.add(a); + } + } + return result; + } + + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { + final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); + + for ( final Genotype g : genotypes ) { + newGs.add(removePLsAndAD(g)); + } + + return newGs; + } + + /** + * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles + * from the original VariantContext 
are no longer present. + * + * @param selectedVC the selected (new) VariantContext + * @param originalVC the original VariantContext + * @return a new non-null GenotypesContext + */ + public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { + final int numNewAlleles = selectedVC.getAlleles().size(); + final int numOriginalAlleles = originalVC.getAlleles().size(); + + // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong + if ( numNewAlleles > numOriginalAlleles ) + throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); + + final GenotypesContext oldGs = selectedVC.getGenotypes(); + + // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything + if ( numNewAlleles == numOriginalAlleles ) + return oldGs; + + final GenotypesContext newGs = fixPLsFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + + return fixADFromSubsettedAlleles(newGs, originalVC, selectedVC.getAlleles()); + } + + /** + * Fix the PLs for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixPLsFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, 
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); + } + + /** + * Fix the AD for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); + } + + return newGTs; + } + + /** + * Fix the AD for the given Genotype + * + * @param genotype the original Genotype + * @param alleleIndexesToUse a bitset describing whether or not to keep a given index + * @param nAllelesToUse how many alleles we are keeping + * @return a non-null Genotype + */ + private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { + // if it ain't broke don't fix it + if ( !genotype.hasAD() ) + return genotype; + + final GenotypeBuilder builder = new GenotypeBuilder(genotype); + + final int[] oldAD = genotype.getAD(); + if ( oldAD.length != alleleIndexesToUse.length ) { + builder.noAD(); + } else { + final int[] newAD = new int[nAllelesToUse]; + int currentIndex = 0; + for ( int i = 0; i < oldAD.length; i++ ) { + if ( alleleIndexesToUse[i] ) + newAD[currentIndex++] = oldAD[i]; + } + builder.AD(newAD); + } + return 
builder.make(); + } + + static private Allele determineReferenceAllele(final List VCs) { + return determineReferenceAllele(VCs, null); + } + + /** + * Determines the common reference allele + * + * @param VCs the list of VariantContexts + * @param loc if not null, ignore records that do not begin at this start location + * @return possibly null Allele + */ + static private Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { + Allele ref = null; + + for ( final VariantContext vc : VCs ) { + if ( contextMatchesLoc(vc, loc) ) { + final Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } + } + + return ref; + } + + public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + return loc == null || loc.getStart() == vc.getStart(); + } + + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + final Map map = createAlleleMapping(refAllele, vc, allAlleles); + map.put(vc.getReference(), refAllele); + return new AlleleMapper(map); + } + } + + /** + * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele + * + * The refAllele is the longest reference allele seen at this start site. 
+ * So imagine it is: + * refAllele: ACGTGA + * myRef: ACGT + * myAlt: A + * + * We need to remap all of the alleles in vc to include the extra GA so that + * myRef => refAllele and myAlt => AGA + * + * @param refAllele the new (extended) reference allele + * @param oneVC the Variant Context to extend + * @param currentAlleles the list of alleles already created + * @return a non-null mapping of original alleles to new (extended) ones + */ + private static Map createAlleleMapping(final Allele refAllele, + final VariantContext oneVC, + final Collection currentAlleles) { + final Allele myRef = oneVC.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + + final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + + final Map map = new HashMap<>(); + for ( final Allele a : oneVC.getAlternateAlleles() ) { + if ( isUsableAlternateAllele(a) ) { + Allele extended = Allele.extend(a, extraBases); + for ( final Allele b : currentAlleles ) + if ( extended.equals(b) ) + extended = b; + map.put(a, extended); + } + } + + return map; + } + + static private boolean isUsableAlternateAllele(final Allele allele) { + return ! 
(allele.isReference() || allele.isSymbolic() ); + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { + if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); + + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + return new ArrayList<>(unsortedVCs); + else { + ArrayList sorted = new ArrayList<>(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { + //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE + for ( final Genotype g : oneVC.getGenotypes() ) { + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); + if ( ! mergedGenotypes.containsSample(name) ) { + // only add if the name is new + Genotype newG = g; + + if ( uniquifySamples || alleleMapping.needsRemapping() ) { + final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); + } + + mergedGenotypes.add(newG); + } + } + } + + /** + * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * + * @param alleles list of alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCalls(final List alleles) { + if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + + final List result = new ArrayList<>(alleles.size()); + for ( final Allele allele : alleles ) + result.add(allele.equals(NON_REF_SYMBOLIC_ALLELE) ? 
allele : Allele.NO_CALL); + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. + * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles) { + for ( final Genotype g : VC.getGenotypes() ) { + // only add if the name is new + final String name = g.getSampleName(); + if ( !mergedGenotypes.containsSample(name) ) { + + if ( !g.hasPL() ) { + if ( g.isNoCall() ) { + mergedGenotypes.add(g); + continue; + } + throw new UserException("cannot merge genotypes from samples without PLs; sample " + g.getSampleName() + " does not have likelihoods at position " + VC.getChr() + ":" + VC.getStart()); + } + + // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); + final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); + final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; + + final Genotype newG = new GenotypeBuilder(g).name(name).alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(PLs).AD(AD).noGQ().make(); + mergedGenotypes.add(newG); + } + } + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. 
+ * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @param position position to use for error messages + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) + throw new UserException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); + final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); + + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference alleles always match up (even if they don't appear to) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs. + * Throws an exception if the Genotype does not contain PLs. 
+ * + * @param genotype the genotype from which to grab PLs + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new PLs + */ + protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) { + if ( !genotype.hasPL() ) + throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs"); + + final int[] originalPLs = genotype.getPL(); + + // assume diploid + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2); + final int[] newPLs = new int[numLikelihoods]; + + for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) { + for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) { + final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]); + if ( originalPLindex >= originalPLs.length ) + throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length); + + final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j); + newPLs[newPLindex] = originalPLs[originalPLindex]; + } + } + + return newPLs; + } + + /** + * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current + * alleles from the original AD. 
+ * + * @param originalAD the original AD to extend + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new AD values + */ + protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { + if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); + + final int numADs = indexesOfRelevantAlleles.length; + if ( numADs == originalAD.length ) + return originalAD; + + final int[] newAD = new int[numADs]; + + for ( int i = 0; i < numADs; i++ ) { + final int oldIndex = indexesOfRelevantAlleles[i]; + if ( oldIndex >= originalAD.length ) + newAD[i] = 0; + else + newAD[i] = originalAD[oldIndex]; + } + + return newAD; + } + + /** + * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() + * + * @param originalIndex1 the index of the first allele + * @param originalIndex2 the index of the second allele + * @return the PL index + */ + protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { + // we need to make sure they are ordered correctly + return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." 
+ trackName : sampleName; + } + + /** + * Trim the alleles in inputVC from the reverse direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, false, true); + } + + /** + * Trim the alleles in inputVC from the forward direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, true, false); + } + + /** + * Trim the alleles in inputVC forward and reverse, as requested + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Ensures("result != null") + public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { + if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); + + if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) + return inputVC; + + // see whether we need to trim common reference base from all alleles + final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; + final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); + final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; + final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); + return vc; + } + + /** + * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and + * the last revTrim bases from the end + * + * @param inputVC a non-null input VC + * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles + * @param revTrim the last revTrim bases of each allele will be clipped off as well + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Requires({"inputVC != null"}) + @Ensures("result != null") + protected static VariantContext trimAlleles(final VariantContext inputVC, + final int fwdTrimEnd, + final int revTrim) { + if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified + return inputVC; + + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); + final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); + + final int start = inputVC.getStart() + (fwdTrimEnd + 1); + final VariantContextBuilder builder = new VariantContextBuilder(inputVC); + builder.start(start); + builder.stop(start + alleles.get(0).length() - 1); + builder.alleles(alleles); + builder.genotypes(genotypes); + return 
builder.make(); + } + + @Requires("originalGenotypes != null && alleleMapper != null") + protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { + final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); + + for ( final Genotype genotype : originalGenotypes ) { + final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); + updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); + } + + return updatedGenotypes; + } + + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + + if ( a.length() - clipping <= 0 || a.length() == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + /** + * Clip out any unnecessary bases off the front of the alleles + * + * The VCF spec represents alleles as block substitutions, replacing AC with A for a + * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that + * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. + * This routine finds an offset among all alleles that can be safely trimmed + * off the left of each allele and still represent the same block substitution. 
+ * + * A/C => A/C + * AC/A => AC/A + * ACC/AC => CC/C + * AGT/CAT => AGT/CAT + * /C => /C + * + * @param unclippedAlleles a non-null list of alleles that we want to clip + * @return the offset into the alleles where we can safely clip, inclusive, or + * -1 if no clipping is tolerated. So, if the result is 0, then we can remove + * the first base of every allele. If the result is 1, we can remove the + * second base. + */ + public static int computeForwardClipping(final List unclippedAlleles) { + // cannot clip unless there's at least 1 alt allele + if ( unclippedAlleles.size() <= 1 ) + return -1; + + // we cannot forward clip any set of alleles containing a symbolic allele + int minAlleleLength = Integer.MAX_VALUE; + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + return -1; + minAlleleLength = Math.min(minAlleleLength, a.length()); + } + + final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); + int indexOflastSharedBase = -1; + + // the -1 to the stop is that we can never clip off the right most base + for ( int i = 0; i < minAlleleLength - 1; i++) { + final byte base = firstAlleleBases[i]; + + for ( final Allele allele : unclippedAlleles ) { + if ( allele.getBases()[i] != base ) + return indexOflastSharedBase; + } + + indexOflastSharedBase = i; + } + + return indexOflastSharedBase; + } + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + if ( vc.getCalledChrCount() == 0 ) + return 0.0; + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + + public static boolean requiresPaddingBase(final List alleles) { + + // see whether one of the alleles would be null if trimmed through + + for ( final String allele : alleles ) { + if ( allele.isEmpty() ) + return true; + } + + int clipping = 0; + Character currentBase = null; + + while ( true ) { + for ( final String allele : alleles ) { + if ( allele.length() - clipping == 0 ) + return true; + + char myBase = 
allele.charAt(clipping); + if ( currentBase == null ) + currentBase = myBase; + else if ( currentBase != myBase ) + return false; + } + + clipping++; + currentBase = null; + } + } + + private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { + Map attributes = new HashMap<>(keysToPreserve.size()); + for ( final String key : keysToPreserve ) { + if ( igc.hasAttribute(key) ) + attributes.put(key, igc.getAttribute(key)); + } + return attributes; + } + + /** + * @deprecated use variant context builder version instead + * @param vc the variant context + * @param keysToPreserve the keys to preserve + * @return a pruned version of the original variant context + */ + @Deprecated + public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { + return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); + } + + public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { + final VariantContext vc = builder.make(); + if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); + + // VC info + final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); + + // Genotypes + final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + final GenotypeBuilder gb = new GenotypeBuilder(g); + // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ + gb.noAD().noDP().noPL().noAttributes(); + genotypes.add(gb.make()); + } + + return builder.genotypes(genotypes).attributes(attributes); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (final Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) 
+ return false; + } + + return true; + } + + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } + vcType = VariantContext.Type.NO_VARIATION; + } + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(type); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( 
!mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); + } + } + + return mappedVCs; + } + + public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { + if ( allowedAttributes == null ) + return vc; + + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + if ( allowedAttributes.contains(attr.getKey()) ) + attrs.put(attr.getKey(), attr.getValue()); + } + newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); + } + + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); + } + + protected static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + + /** + * @return the list of unique values + */ + public List getUniqueMappedAlleles() { + if ( map == null ) + return Collections.emptyList(); + return new ArrayList<>(new HashSet<>(map.values())); + } + } + + private static class CompareByPriority implements Comparator, Serializable { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getSource()); + if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } + + /** + * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles + * + * @param name the name of the VC + * @param contig the contig for the VC + * @param start the start of the VC + * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the + * alt. 
Will compute the stop of the VC from the length of the reference allele + * @return a non-null VariantContext + */ + public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { + if ( alleleStrings == null || alleleStrings.isEmpty() ) + throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); + + final List alleles = new LinkedList<>(); + final int length = alleleStrings.get(0).length(); + + boolean first = true; + for ( final String alleleString : alleleStrings ) { + alleles.add(Allele.create(alleleString, first)); + first = false; + } + return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); + } + + /** + * Splits the alleles for the provided variant context into its primitive parts. + * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. + * Currently works only for MNPs. + * + * @param vc the non-null VC to split + * @return a non-empty list of VCs split into primitive parts or the original VC otherwise + */ + public static List splitIntoPrimitiveAlleles(final VariantContext vc) { + if ( vc == null ) + throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); + + if ( !vc.isBiallelic() ) + throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); + + // currently only works for MNPs + if ( !vc.isMNP() ) + return Arrays.asList(vc); + + final byte[] ref = vc.getReference().getBases(); + final byte[] alt = vc.getAlternateAllele(0).getBases(); + + if ( ref.length != alt.length ) + throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); + + final List result = new ArrayList<>(ref.length); + + for ( int i = 0; i < ref.length; i++ ) { + + // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) + 
if ( ref[i] != alt[i] ) { + + // create the ref and alt SNP alleles + final Allele newRefAllele = Allele.create(ref[i], true); + final Allele newAltAllele = Allele.create(alt[i], false); + + // create a new VariantContext with the new SNP alleles + final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); + + // create new genotypes with updated alleles + final Map alleleMap = new HashMap<>(); + alleleMap.put(vc.getReference(), newRefAllele); + alleleMap.put(vc.getAlternateAllele(0), newAltAllele); + final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); + + result.add(newVC.genotypes(newGenotypes).make()); + } + } + + if ( result.isEmpty() ) + result.add(vc); + + return result; + } + + /** + * Are vc1 and 2 equal including their position and alleles? + * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } + + /** + * Returns the absolute 0-based index of an allele. + * + *

+ * If the allele is equal to the reference, the result is 0, if it equal to the first alternative the result is 1 + * and so forth. + *

+ * Therefore if you want the 0-based index within the alternative alleles you need to do the following: + * + *

+ * You can indicate whether the Java object reference comparator {@code ==} can be safelly used by setting {@code useEquals} to {@code false}. + * + * @param vc the target variant context. + * @param allele the target allele. + * @param ignoreRefState whether the reference states of the allele is important at all. Has no effect if {@code useEquals} is {@code false}. + * @param considerRefAllele whether the reference allele should be considered. You should set it to {@code false} if you are only interested in alternative alleles. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and {@link VariantContext#getNAlleles()} {@code -1} otherwise. + */ + public static int indexOfAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele, final boolean useEquals) { + if (allele == null) throw new IllegalArgumentException(); + return useEquals ? indexOfEqualAllele(vc,allele,ignoreRefState,considerRefAllele) : indexOfSameAllele(vc,allele,considerRefAllele); + } + + /** + * Returns the relative 0-based index of an alternative allele. + *

+ * The the query allele is the same as the first alternative allele, the result is 0, + * if it is equal to the second 1 and so forth. + * + * + *

+ * Notice that the ref-status of the query {@code allele} is ignored. + * + * @param vc the target variant context. + * @param allele the query allele. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and the number + * of alternative alleles - 1. + */ + public static int indexOfAltAllele(final VariantContext vc, final Allele allele, final boolean useEquals) { + final int absoluteIndex = indexOfAllele(vc,allele,true,false,useEquals); + return absoluteIndex == -1 ? -1 : absoluteIndex - 1; + } + + // Impements index search using equals. + private static int indexOfEqualAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, + final boolean considerRefAllele) { + int i = 0; + for (final Allele a : vc.getAlleles()) + if (a.equals(allele,ignoreRefState)) + return i == 0 ? (considerRefAllele ? 0 : -1) : i; + else + i++; + return -1; + } + + // Implements index search using ==. + private static int indexOfSameAllele(final VariantContext vc, final Allele allele, final boolean considerRefAllele) { + int i = 0; + + for (final Allele a : vc.getAlleles()) + if (a == allele) + return i == 0 ? (considerRefAllele ? 
0 : -1) : i; + else + i++; + + return -1; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleHeader.java diff --git a/public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java rename to public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/wiggle/WiggleWriter.java diff --git a/public/keys/GATK_public.key b/public/gatk-framework/src/main/resources/GATK_public.key similarity index 100% rename from public/keys/GATK_public.key rename to public/gatk-framework/src/main/resources/GATK_public.key diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key similarity index 100% rename from public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key diff --git 
a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R diff --git a/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R b/public/gatk-framework/src/main/resources/org/broadinstitute/sting/utils/recalibration/BQSR.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R rename to public/gatk-framework/src/main/resources/org/broadinstitute/sting/utils/recalibration/BQSR.R diff --git a/public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java b/public/gatk-framework/src/test/java/net/sf/samtools/GATKBAMFileSpanUnitTest.java similarity index 100% rename from public/java/test/net/sf/samtools/GATKBAMFileSpanUnitTest.java rename to public/gatk-framework/src/test/java/net/sf/samtools/GATKBAMFileSpanUnitTest.java diff --git a/public/java/test/net/sf/samtools/GATKChunkUnitTest.java b/public/gatk-framework/src/test/java/net/sf/samtools/GATKChunkUnitTest.java similarity index 100% rename from public/java/test/net/sf/samtools/GATKChunkUnitTest.java rename to public/gatk-framework/src/test/java/net/sf/samtools/GATKChunkUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java new file mode 100644 index 000000000..e8aed7d50 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/BaseTest.java @@ -0,0 +1,527 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and 
associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting; + +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.apache.log4j.spi.LoggingEvent; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.io.IOUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.bcf2.BCF2Codec; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFConstants; +import 
org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.SkipException; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 14, 2009 + * Time: 10:24:30 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

+ * Class BaseTest + *

+ * This is the base test class for all of our test cases. All test cases should extend from this + * class; it sets up the logger, and resolves the location of directories that we rely on. + */ +@SuppressWarnings("unchecked") +public abstract class BaseTest { + /** our log, which we want to capture anything from org.broadinstitute.sting */ + public static final Logger logger = CommandLineUtils.getStingLogger(); + + public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; + public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; + //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; + public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; + public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; + public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; + public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; + public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; + public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; + public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; + + public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; + + public static final String dbsnpDataLocation = GATKDataLocation; + public static final String b36dbSNP129 = dbsnpDataLocation + 
"dbsnp_129_b36.vcf"; + public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; + public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; + + public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; + public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; + + public static final String intervalsLocation = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/"; + public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; + public static final String hg19Chr20Intervals = GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; + + public static final boolean REQUIRE_NETWORK_CONNECTION = false; + private static final String networkTempDirRoot = "/broad/hptmp/"; + private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); + private static final File networkTempDirFile; + + private static final String privateTestDirRelative = "private/testdata/"; + public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/"; + protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); + + private static final String publicTestDirRelative = "public/testdata/"; + public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/"; + protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); + + public static final String keysDataLocation = validationDataLocation + "keys/"; + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + + 
public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; + + public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; + + public static final boolean pipelineTestRunModeIsSet = System.getProperty("pipeline.run", "").equals("run"); + + /** before the class starts up */ + static { + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + + // setup our log layout + PatternLayout layout = new PatternLayout(); + layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Set the Root logger to only output warnings. + logger.setLevel(Level.WARN); + + if (networkTempDirRootExists) { + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + } else { + networkTempDirFile = null; + } + + + if ( REQUIRE_NETWORK_CONNECTION ) { + // find our file sources + if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { + logger.fatal("We can't locate the reference directories. Aborting!"); + throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); + } + } + } + + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. 
Return from it the call to getTests, providing + * the class type of your test + * + * + * {@literal @}DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * + * This class magically tracks created objects of this + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap<>(); + protected String name; + + /** + * Create a new TestDataProvider instance bound to the class variable C + */ + public TestDataProvider(Class c, String name) { + if ( ! tests.containsKey(c) ) + tests.put(c, new ArrayList<>()); + tests.get(c).add(this); + this.name = name; + } + + public TestDataProvider(Class c) { + this(c, ""); + } + + public void setName(final String name) { + this.name = name; + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + + @Override + public String toString() { + return "TestDataProvider("+name+")"; + } + } + + /** + * test if the file exists + * + * @param file name as a string + * @return true if it exists + */ + public static boolean fileExist(String file) { + File temp = new File(file); + return temp.exists(); + } + + /** + * this appender looks for a specific message in the log4j stream. + * It can be used to verify that a specific message was generated to the logging system. 
+ */ + public static class ValidationAppender extends AppenderSkeleton { + + private boolean foundString = false; + private String targetString = ""; + + public ValidationAppender(String target) { + targetString = target; + } + + @Override + protected void append(LoggingEvent loggingEvent) { + if (loggingEvent.getMessage().equals(targetString)) + foundString = true; + } + + public void close() { + // do nothing + } + + public boolean requiresLayout() { + return false; + } + + public boolean foundString() { + return foundString; + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Prefix of the file. + * @param extension Extension to concat to the end of the file. + * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. + */ + public static File createTempFile(String name, String extension) { + try { + File file = File.createTempFile(name, extension); + file.deleteOnExit(); + return file; + } catch (IOException ex) { + throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Name of the file. + * @return A file in the network temporary directory with name, which will be deleted after the program exits. + * @throws SkipException when the network is not available. 
+ */ + public static File tryCreateNetworkTempFile(String name) { + if (!networkTempDirRootExists) + throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); + File file = new File(networkTempDirFile, name); + file.deleteOnExit(); + return file; + } + + /** + * Log this message so that it shows up inline during output as well as in html reports + * + * @param message + */ + public static void log(final String message) { + Reporter.log(message, true); + } + + private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected); + } + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); + } + + public static final void assertEqualsDoubleSmart(final double actual, final double expected) { + assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); + } + + public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { + final Set actualSet = new HashSet(actual); + final Set expectedSet = new HashSet(expected); + Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + assertEqualsDoubleSmart(actual, expected, tolerance, null); + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { + if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); + else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); + else { + final double delta = Math.abs(actual - expected); + final double ratio = Math.abs(actual / expected - 1.0); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + + " not within tolerance " + tolerance + + (message == null ? "" : "message: " + message)); + } + } + + public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { + Assert.assertNotNull(actual, "VariantContext expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); + + assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); + assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); + assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); + if ( expected.hasGenotypes() ) { + assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); + final Set samples = expected.getSampleNames(); + for ( final String sample : samples ) { + assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); + } + } + } + + public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { + final Iterator actualIT = actual.iterator(); + final Iterator expectedIT = expected.iterator(); + + while ( expectedIT.hasNext() ) { + final VariantContext expectedVC = expectedIT.next(); + if ( expectedVC == null ) + continue; + + VariantContext actualVC; + do { + Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); + actualVC = actualIT.next(); + } while ( actualIT.hasNext() && actualVC == null ); + + if ( actualVC == null ) + Assert.fail("Too few records in actual"); + + assertVariantContextsAreEqual(actualVC, expectedVC); + } + Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); + } + + + public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { + Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); + + // filters are the same + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); + + // inline attributes + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); + Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); + + 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); + + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); + assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); + } + + public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { + Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); + + // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
+ //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); + final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); + final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); + for ( int i = 0; i < actualLines.size(); i++ ) { + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); + } + } + + public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { + final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); + final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); + assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); + assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); + } + + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { + if ( expected instanceof Double ) { + // must be very tolerant because doubles are being rounded to 2 sig figs + assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); + } else + Assert.assertEquals(actual, expected, "Attribute " + key); + } + + private static void assertAttributesEquals(final Map actual, Map expected) { + final Set expectedKeys = new HashSet(expected.keySet()); + + for ( final Map.Entry act : actual.entrySet() ) { + final Object actualValue = act.getValue(); + if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { + final Object expectedValue = expected.get(act.getKey()); + if ( expectedValue instanceof List ) { + final List expectedList = (List)expectedValue; + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); + final List actualList = (List)actualValue; + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); + for ( int i = 0; i < expectedList.size(); i++ ) + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); 
+ } else + assertAttributeEquals(act.getKey(), actualValue, expectedValue); + } else { + // it's ok to have a binding in x -> null that's absent in y + Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + } + expectedKeys.remove(act.getKey()); + } + + // now expectedKeys contains only the keys found in expected but not in actual, + // and they must all be null + for ( final String missingExpected : expectedKeys ) { + final Object value = expected.get(missingExpected); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); + } + } + + private static final boolean isMissing(final Object value) { + if ( value == null ) return true; + else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; + else if ( value instanceof List ) { + // handles the case where all elements are null or the list is empty + for ( final Object elt : (List)value) + if ( elt != null ) + return false; + return true; + } else + return false; + } + + /** + * Checks whether two double array contain the same values or not. + * @param actual actual produced array. + * @param expected expected array. + * @param tolerance maximum difference between double value to be consider equivalent. 
+ */ + protected static void assertEqualsDoubleArray(final double[] actual, final double[] expected, final double tolerance) { + if (expected == null) + Assert.assertNull(actual); + else { + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length,"array length"); + } + for (int i = 0; i < actual.length; i++) + Assert.assertEquals(actual[i],expected[i],tolerance,"array position " + i); + } +} diff --git a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/ExampleToCopyUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/ExampleToCopyUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/MD5DB.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java diff --git a/public/java/test/org/broadinstitute/sting/MD5Mismatch.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5Mismatch.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/MD5Mismatch.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5Mismatch.java diff --git a/public/java/test/org/broadinstitute/sting/StingTextReporter.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/StingTextReporter.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/StingTextReporter.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/StingTextReporter.java diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/TestNGTestTransformer.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/TestNGTestTransformer.java diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/WalkerTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/WalkerTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/WalkerTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java new file mode 100644 index 000000000..29d38ec19 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java @@ -0,0 +1,126 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Collection; + +public class RodBindingCollectionUnitTest extends BaseTest { + + private ParsingEngine parsingEngine; + private Tags mytags; + + private static final String defaultTagString = "VCF"; + private static final String testVCFFileName = privateTestDir + "empty.vcf"; + private static final String testListFileName = privateTestDir + "oneVCF.list"; + + @BeforeMethod + public void setUp() { + parsingEngine = new ParsingEngine(null); + RodBinding.resetNameCounter(); + mytags = new Tags(); + mytags.addPositionalTag(defaultTagString); + } + + private class RodBindingCollectionArgProvider { + @Argument(fullName="input",doc="input",shortName="V") + public RodBindingCollection input; + } + + @Test + public void testStandardVCF() { + final String[] commandLine = new String[] {"-V", testVCFFileName}; + + parsingEngine.addArgumentSource( RodBindingCollectionArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + final RodBindingCollectionArgProvider argProvider = new RodBindingCollectionArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + 
Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), testVCFFileName, "Argument is not correctly initialized"); + } + + @Test + public void testList() { + final String[] commandLine = new String[] {"-V", testListFileName}; + + parsingEngine.addArgumentSource(RodBindingCollectionArgProvider.class); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + final RodBindingCollectionArgProvider argProvider = new RodBindingCollectionArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), "private/testdata/empty.vcf", "Argument is not correctly initialized"); + } + + @Test + public void testDefaultTagsInFile() throws IOException { + + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.defaultTags", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + writer.write(testVCFFileName, 0, testVCFFileName.length()); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + + final Collection bindings = parsingEngine.getRodBindings(); + Assert.assertNotNull(bindings); + Assert.assertEquals(bindings.size(), 1); + + final RodBinding binding = bindings.iterator().next(); + Assert.assertEquals(parsingEngine.getTags(binding), mytags); + } + + @Test + public void testOverrideTagsInFile() throws IOException { + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.overrideTags", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + final String textToWrite = "foo " + testVCFFileName; + writer.write(textToWrite, 0, textToWrite.length()); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + + final Collection bindings = 
parsingEngine.getRodBindings(); + Assert.assertNotNull(bindings); + Assert.assertEquals(bindings.size(), 1); + + final RodBinding binding = bindings.iterator().next(); + Assert.assertNotEquals(parsingEngine.getTags(binding), mytags); + } +} diff --git a/public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/CommandLineGATKUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java new file mode 100644 index 000000000..b10043340 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -0,0 +1,267 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* 
obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.qc.CountReads; +import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * Tests selected functionality in the GenomeAnalysisEngine class + */ +public class GenomeAnalysisEngineUnitTest extends BaseTest { + + @Test(expectedExceptions=UserException.class) + public void testEmptySamFileListHandling() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + testEngine.setWalker(new CountReads()); //generalizable to any walker requiring reads + + //supply command line args so validateSuppliedReads() knows whether reads were passed in + GATKArgumentCollection testArgs = new GATKArgumentCollection(); + testArgs.samFiles.add("empty.list"); + testEngine.setArguments(testArgs); + + //represents the empty list of samFiles read in from empty.list by CommandLineExecutable + Collection samFiles = new ArrayList(); + + testEngine.setSAMFileIDs(samFiles); + testEngine.validateSuppliedReads(); + } + + 
@Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + final File relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); + samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test + public void testEmptyIntervalSetHandling() throws Exception { + GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 
1000).getSequenceDictionary()); + + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + testEngine.setWalker(new PrintReads()); + testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); + + testEngine.validateSuppliedIntervals(); + } + + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); + Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam 
newSample extraField"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + /////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { 
return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != 
ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/IntervalReferenceOrderedViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusReferenceViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProviderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/FilePointerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/PicardBaselineBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadProcessingBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SeekableBufferedStreamUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/TheoreticalMinimaBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingIntegrationTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/executive/ReduceTreeUnitTest.java 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/AllowNCigarMalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadFilterTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/UnsafeMalformedReadFilterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/ReadFormattingIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/StingSAMIteratorAdapterUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/CheckableCloseableTribbleIterator.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java 
similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java similarity 
index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java 
similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java new file mode 100644 index 000000000..0971cb90b --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.testng.annotations.Test; +import org.broadinstitute.sting.WalkerTest; + +import java.util.Collections; + +/** + * Run validating pileup across a set of core data as proof of the integrity of the GATK core. + * + * Tests both types of old-school pileup formats (basic and consensus). + * + * @author mhanna, vdauwera + * @version 0.2 + */ +public class CheckPileupIntegrationTest extends WalkerTest { + /** + * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels + */ + @Test(enabled = true) + public void testEcoliConsensusPileup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CheckPileup" + + " -I " + validationDataLocation + "MV1994.selected.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + + " -S SILENT -nt 8",0, Collections.emptyList()); + executeTest("testEcoliConsensusPileup",spec); + } + + /** + * This test runs on a basic pileup containing 6-column lines for all variants TODO + */ + @Test + public void testEcoliBasicPileup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CheckPileup" + + " -I " + validationDataLocation + "MV1994.selected.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" + + " -L Escherichia_coli_K12:1-49" + + " -S SILENT -nt 8",0, Collections.emptyList()); + executeTest("testEcoliBasicPileup",spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/CountReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/FlagStatIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/qc/PileupWalkerIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ClipReadsWalkersIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsLargeScaleTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/PrintReadsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/clibrary/LibCUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/AutoFormattingTimeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/GenomeLocUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java new file mode 100644 index 000000000..312e4d5b1 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MWUnitTest.java @@ -0,0 +1,131 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.collections.Pair; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; +import org.testng.Assert; + +/** + * Created by IntelliJ IDEA. 
+ * User: Ghost + * Date: 3/5/11 + * Time: 2:06 PM + * To change this template use File | Settings | File Templates. + */ +public class MWUnitTest extends BaseTest { + @BeforeClass + public void init() { } + + @Test + private void testMWU() { + logger.warn("Testing MWU"); + MannWhitneyU mwu = new MannWhitneyU(); + mwu.add(0, MannWhitneyU.USet.SET1); + mwu.add(1,MannWhitneyU.USet.SET2); + mwu.add(2,MannWhitneyU.USet.SET2); + mwu.add(3,MannWhitneyU.USet.SET2); + mwu.add(4,MannWhitneyU.USet.SET2); + mwu.add(5,MannWhitneyU.USet.SET2); + mwu.add(6,MannWhitneyU.USet.SET1); + mwu.add(7,MannWhitneyU.USet.SET1); + mwu.add(8,MannWhitneyU.USet.SET1); + mwu.add(9,MannWhitneyU.USet.SET1); + mwu.add(10,MannWhitneyU.USet.SET1); + mwu.add(11,MannWhitneyU.USet.SET2); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); + + MannWhitneyU mwu2 = new MannWhitneyU(); + MannWhitneyU mwuNoDither = new MannWhitneyU(false); + for ( int dp : new int[]{2,4,5,6,8} ) { + mwu2.add(dp,MannWhitneyU.USet.SET1); + mwuNoDither.add(dp,MannWhitneyU.USet.SET1); + } + + for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { + mwu2.add(dp,MannWhitneyU.USet.SET2); + mwuNoDither.add(dp,MannWhitneyU.USet.SET2); + } + + MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; + MannWhitneyU.ExactMode cm = MannWhitneyU.ExactMode.CUMULATIVE; + + // tests using the hypothesis that set 2 dominates set 1 (U value = 10) + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); + + Pair 
sizes = mwu2.getSetSizes(); + + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14); + logger.warn("Testing two-sided"); + Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); + + // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009559074,1e-14); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 + Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar + 
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar + Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5); + Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14); + + logger.warn("Set 1"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); + logger.warn("Set 2"); + Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); + + MannWhitneyU mwu3 = new MannWhitneyU(); + for ( int dp : new int[]{0,2,4} ) { + mwu3.add(dp,MannWhitneyU.USet.SET1); + } + for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { + mwu3.add(dp,MannWhitneyU.USet.SET2); + } + long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); + //logger.warn(String.format("U is: %d",u)); + Pair nums = mwu3.getSetSizes(); + //logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u))); + //logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, u))); + //logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first))); + //logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first))); + Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14); + Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14); + 
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); + + } +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java new file mode 100644 index 000000000..1bcf38d10 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -0,0 +1,913 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import cern.jet.random.Normal; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for MathUtils + */ +public class MathUtilsUnitTest extends BaseTest { + + @BeforeClass + public void init() { + } + + /** + * Tests that we get unique values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. + */ + @Test + public void testGenerateUniqueHashFromThreePositiveIntegers() { + logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); + + final Set observedLongs = new HashSet<>(); + for (short i = 0; i < Byte.MAX_VALUE; i++) { + for (short j = 0; j < Byte.MAX_VALUE; j++) { + for (short k = 0; k < Byte.MAX_VALUE; k++) { + final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); + //System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); + Assert.assertTrue(observedLongs.add(aLong)); + } + } + } + + for (short i = Byte.MAX_VALUE; i <= Short.MAX_VALUE && i > 0; i += 128) { + for (short j = Byte.MAX_VALUE; j <= Short.MAX_VALUE && j > 0; j += 128) { + for (short k = Byte.MAX_VALUE; k <= Short.MAX_VALUE && k > 0; k += 128) { + final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); + // System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); + Assert.assertTrue(observedLongs.add(aLong)); + } + } + } + } + + @Test(dataProvider = "log10OneMinusPow10Data") + public void testLog10OneMinusPow10(final double x, final double expected) { + final double actual = MathUtils.log10OneMinusPow10(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @Test(dataProvider = 
"log1mexpData") + public void testLog1mexp(final double x, final double expected) { + final double actual = MathUtils.log1mexp(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @DataProvider(name = "log10OneMinusPow10Data") + public Iterator log10OneMinusPow10Data() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log10( 1 - Math.pow(10,input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name = "log1mexpData") + public Iterator log1mexpData() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log( 1 - Math.exp(input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testBinomialProbability() { + logger.warn("Executing testBinomialProbability"); + + Assert.assertEquals(MathUtils.binomialProbability(3, 2, 0.5), 0.375, 0.0001); + Assert.assertEquals(MathUtils.binomialProbability(100, 10, 0.5), 1.365543e-17, 1e-18); + Assert.assertEquals(MathUtils.binomialProbability(217, 73, 0.02), 4.521904e-67, 1e-68); + Assert.assertEquals(MathUtils.binomialProbability(300, 100, 0.02), 
9.27097e-91, 1e-92); + Assert.assertEquals(MathUtils.binomialProbability(300, 150, 0.98), 6.462892e-168, 1e-169); + Assert.assertEquals(MathUtils.binomialProbability(300, 120, 0.98), 3.090054e-221, 1e-222); + Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); + } + + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testCumulativeBinomialProbability() { + logger.warn("Executing testCumulativeBinomialProbability"); + + for (int j = 0; j < 2; j++) { // Test memoizing functionality, as well. + final int numTrials = 10; + for ( int i = 0; i < numTrials; i++ ) + Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); + + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); + } + } + + /** + * Tests that we get the right values from the multinomial distribution + */ + @Test + public void testMultinomialProbability() { + logger.warn("Executing testMultinomialProbability"); + + int[] counts0 = {2, 0, 1}; + double[] probs0 = {0.33, 0.33, 0.34}; + Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); + + int[] counts1 = {10, 20, 30}; + double[] probs1 = {0.25, 0.25, 0.50}; + Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); + + int[] counts2 = {38, 82, 50, 36}; + double[] probs2 = {0.25, 0.25, 0.25, 0.25}; + Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); + + int[] counts3 = {1, 600, 1}; + double[] probs3 = {0.33, 0.33, 0.34}; + Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); + } + + /** + * Tests 
that the random index selection is working correctly + */ + @Test + public void testRandomIndicesWithReplacement() { + logger.warn("Executing testRandomIndicesWithReplacement"); + + // Check that the size of the list returned is correct + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 0).size() == 0); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1).size() == 1); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 5).size() == 5); + Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1000).size() == 1000); + + // Check that the list contains only the k element range that as asked for - no more, no less + List Five = new ArrayList<>(); + Collections.addAll(Five, 0, 1, 2, 3, 4); + List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); + Assert.assertTrue(BigFive.containsAll(Five)); + Assert.assertTrue(Five.containsAll(BigFive)); + } + + /** + * Tests that we get the right values from the multinomial distribution + */ + @Test + public void testSliceListByIndices() { + logger.warn("Executing testSliceListByIndices"); + + // Check that the list contains only the k element range that as asked for - no more, no less but now + // use the index list to pull elements from another list using sliceListByIndices + List Five = new ArrayList<>(); + Collections.addAll(Five, 0, 1, 2, 3, 4); + List FiveAlpha = new ArrayList<>(); + Collections.addAll(FiveAlpha, 'a', 'b', 'c', 'd', 'e'); + List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); + List BigFiveAlpha = MathUtils.sliceListByIndices(BigFive, FiveAlpha); + Assert.assertTrue(BigFiveAlpha.containsAll(FiveAlpha)); + Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); + } + + /** + * Tests that we correctly compute mean and standard deviation from a stream of numbers + */ + @Test + public void testRunningAverage() { + logger.warn("Executing testRunningAverage"); + + int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; + MathUtils.RunningAverage r = new 
MathUtils.RunningAverage(); + + for (final double b : numbers) + r.add(b); + + Assert.assertEquals((long) numbers.length, r.observationCount()); + Assert.assertTrue(r.mean() - 3224.625 < 2e-10); + Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); + } + + @Test + public void testLog10Gamma() { + logger.warn("Executing testLog10Gamma"); + + Assert.assertEquals(MathUtils.log10Gamma(4.0), 0.7781513, 1e-6); + Assert.assertEquals(MathUtils.log10Gamma(10), 5.559763, 1e-6); + Assert.assertEquals(MathUtils.log10Gamma(10654), 38280.53, 1e-2); + } + + @Test + public void testLog10BinomialCoefficient() { + logger.warn("Executing testLog10BinomialCoefficient"); + // note that we can test the binomial coefficient calculation indirectly via Newton's identity + // (1+z)^m = sum (m choose k)z^k + double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; + int[] exponent = new int[]{5,15,25,50,100}; + for ( double z : z_vals ) { + double logz = Math.log10(z); + for ( int exp : exponent ) { + double expected_log = exp*Math.log10(1+z); + double[] newtonArray_log = new double[1+exp]; + for ( int k = 0 ; k <= exp; k++ ) { + newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; + } + Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); + } + } + + Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); + Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); + Assert.assertEquals(MathUtils.log10BinomialCoefficient(103928, 119), 400.2156, 1e-4); + } + + @Test + public void testFactorial() { + logger.warn("Executing testFactorial"); + Assert.assertEquals((int) MathUtils.factorial(4), 24); + Assert.assertEquals((int) MathUtils.factorial(10), 3628800); + Assert.assertEquals((int) MathUtils.factorial(12), 479001600); + } + + @Test + public void testLog10Factorial() { + logger.warn("Executing testLog10Factorial"); + Assert.assertEquals(MathUtils.log10Factorial(4), 1.380211, 1e-6); + 
Assert.assertEquals(MathUtils.log10Factorial(10), 6.559763, 1e-6); + Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); + Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 1e-3); + Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); + double log10factorial_small = 0; + double log10factorial_middle = 374.8969; + double log10factorial_large = 45138.26; + int small_start = 1; + int med_start = 200; + int large_start = 12342; + for ( int i = 1; i < 1000; i++ ) { + log10factorial_small += Math.log10(i+small_start); + log10factorial_middle += Math.log10(i+med_start); + log10factorial_large += Math.log10(i+large_start); + Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); + Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); + Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); + } + } + + @Test + public void testApproximateLog10SumLog10() { + + final double requiredPrecision = 1E-4; + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), 
Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 
requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), 
Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + 
Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, 
-17.9341)), requiredPrecision); + + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); + } + } + } + + @Test + public void testLog10sumLog10() { + final double requiredPrecision = 1E-14; + + final double log3 = 0.477121254719662; + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3, requiredPrecision); + + final double log2 = 0.301029995663981; + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 
1), 0.0, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.145}), -0.145, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, 
-0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + 
Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); + + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); + } + } + } + + @Test + public void testLogDotProduct() { + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); + } + + @Test + public void testNormalDistribution() { + final double requiredPrecision 
= 1E-10; + + final Normal n = new Normal(0.0, 1.0, null); + for( final double mu : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { + for( final double sigma : new double[]{1.2, 3.0, 5.8977} ) { + for( final double x : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { + n.setState(mu, sigma); + Assert.assertEquals(n.pdf(x), MathUtils.normalDistribution(mu, sigma, x), requiredPrecision); + Assert.assertEquals(Math.log10(n.pdf(x)), MathUtils.normalDistributionLog10(mu, sigma, x), requiredPrecision); + } + } + } + } + + @DataProvider(name = "ArrayMinData") + public Object[][] makeArrayMinData() { + List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(-10), -10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, -3}); + } + + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinList(final List values, final int expected) { + final int actual = MathUtils.arrayMin(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinIntArray(final List values, final int expected) { + final int[] asArray = ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])); + final int actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinByteArray(final List values, final int expected) { + final byte[] asArray = new byte[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (byte)(values.get(i) & 0xFF); + final 
byte actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (byte)(expected & 0xFF), "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinDoubleArray(final List values, final int expected) { + final double[] asArray = new double[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (double)(values.get(i)); + final double actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (double)expected, "Failed with " + values); + } + + @DataProvider(name = "MedianData") + public Object[][] makeMedianData() { + final List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(1, 10), 10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { + tests.add(new Object[]{values, 1.1}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MedianData") + public void testMedian(final List values, final Comparable expected) { + final Comparable actual = MathUtils.median(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + + + // man. All this to test dirichlet. + + private double[] unwrap(List stuff) { + double[] unwrapped = new double[stuff.size()]; + int idx = 0; + for ( Double d : stuff ) { + unwrapped[idx++] = d == null ? 0.0 : d; + } + + return unwrapped; + } + + /** + * The PartitionGenerator generates all of the partitions of a number n, e.g. 
+ * 5 + 0 + * 4 + 1 + * 3 + 2 + * 3 + 1 + 1 + * 2 + 2 + 1 + * 2 + 1 + 1 + 1 + * 1 + 1 + 1 + 1 + 1 + * + * This is used to help enumerate the state space over which the Dirichlet-Multinomial is defined, + * to ensure that the distribution function is properly implemented + */ + class PartitionGenerator implements Iterator> { + // generate the partitions of an integer, each partition sorted numerically + int n; + List a; + + int y; + int k; + int state; + + int x; + int l; + + public PartitionGenerator(int n) { + this.n = n; + this.y = n - 1; + this.k = 1; + this.a = new ArrayList<>(); + for ( int i = 0; i < n; i++ ) { + this.a.add(i); + } + this.state = 0; + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return ! ( this.k == 0 && state == 0 ); } + + private String dataStr() { + return String.format("a = [%s] k = %d y = %d state = %d x = %d l = %d", + Utils.join(",",a), k, y, state, x, l); + } + + public List next() { + if ( this.state == 0 ) { + this.x = a.get(k-1)+1; + k -= 1; + this.state = 1; + } + + if ( this.state == 1 ) { + while ( 2 * x <= y ) { + this.a.set(k,x); + this.y -= (int) x; + this.k++; + } + this.l = 1+this.k; + this.state = 2; + } + + if ( this.state == 2 ) { + if ( x <= y ) { + this.a.set(k,x); + this.a.set(l,y); + x += 1; + y -= 1; + return this.a.subList(0, this.k + 2); + } else { + this.state =3; + } + } + + if ( this.state == 3 ) { + this.a.set(k,x+y); + this.y = x + y - 1; + this.state = 0; + return a.subList(0, k + 1); + } + + throw new IllegalStateException("Cannot get here"); + } + + public String toString() { + final StringBuilder buf = new StringBuilder(); + buf.append("{ "); + while ( hasNext() ) { + buf.append("["); + buf.append(Utils.join(",",next())); + buf.append("],"); + } + buf.deleteCharAt(buf.lastIndexOf(",")); + buf.append(" }"); + return buf.toString(); + } + + } + + /** + * NextCounts is the enumerator over the state space of the multinomial dirichlet. 
+ * + * It filters the partition of the total sum to only those with a number of terms + * equal to the number of categories. + * + * It then generates all permutations of that partition. + * + * In so doing it enumerates over the full state space. + */ + class NextCounts implements Iterator { + + private PartitionGenerator partitioner; + private int numCategories; + private int[] next; + + public NextCounts(int numCategories, int totalCounts) { + partitioner = new PartitionGenerator(totalCounts); + this.numCategories = numCategories; + next = nextFromPartitioner(); + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return next != null; } + + public int[] next() { + int[] toReturn = clone(next); + next = nextPermutation(); + if ( next == null ) { + next = nextFromPartitioner(); + } + + return toReturn; + } + + private int[] clone(int[] arr) { + return Arrays.copyOf(arr, arr.length); + } + + private int[] nextFromPartitioner() { + if ( partitioner.hasNext() ) { + List nxt = partitioner.next(); + while ( partitioner.hasNext() && nxt.size() > numCategories ) { + nxt = partitioner.next(); + } + + if ( nxt.size() > numCategories ) { + return null; + } else { + int[] buf = new int[numCategories]; + for ( int idx = 0; idx < nxt.size(); idx++ ) { + buf[idx] = nxt.get(idx); + } + Arrays.sort(buf); + return buf; + } + } + + return null; + } + + public int[] nextPermutation() { + return MathUtilsUnitTest.nextPermutation(next); + } + + } + + public static int[] nextPermutation(int[] next) { + // the counts can swap among each other. 
The int[] is originally in ascending order + // this generates the next array in lexicographic order descending + + // locate the last occurrence where next[k] < next[k+1] + int gt = -1; + for ( int idx = 0; idx < next.length-1; idx++) { + if ( next[idx] < next[idx+1] ) { + gt = idx; + } + } + + if ( gt == -1 ) { + return null; + } + + int largestLessThan = gt+1; + for ( int idx = 1 + largestLessThan; idx < next.length; idx++) { + if ( next[gt] < next[idx] ) { + largestLessThan = idx; + } + } + + int val = next[gt]; + next[gt] = next[largestLessThan]; + next[largestLessThan] = val; + + // reverse the tail of the array + int[] newTail = new int[next.length-gt-1]; + int ctr = 0; + for ( int idx = next.length-1; idx > gt; idx-- ) { + newTail[ctr++] = next[idx]; + } + + for ( int idx = 0; idx < newTail.length; idx++) { + next[gt+idx+1] = newTail[idx]; + } + + return next; + } + + + // before testing the dirichlet multinomial, we need to test the + // classes used to test the dirichlet multinomial + + @Test + public void testPartitioner() { + int[] numsToTest = new int[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + int[] expectedSizes = new int[]{1, 2, 3, 5, 7, 11, 15, 22, 30, 42, 56, 77, 101, 135, 176, 231, 297, 385, 490, 627}; + for ( int testNum = 0; testNum < numsToTest.length; testNum++ ) { + PartitionGenerator gen = new PartitionGenerator(numsToTest[testNum]); + int size = 0; + while ( gen.hasNext() ) { + logger.debug(gen.dataStr()); + size += 1; + gen.next(); + } + Assert.assertEquals(size,expectedSizes[testNum], + String.format("Expected %d partitions, observed %s",expectedSizes[testNum],new PartitionGenerator(numsToTest[testNum]).toString())); + } + } + + @Test + public void testNextPermutation() { + int[] arr = new int[]{1,2,3,4}; + int[][] gens = new int[][] { + new int[]{1,2,3,4}, + new int[]{1,2,4,3}, + new int[]{1,3,2,4}, + new int[]{1,3,4,2}, + new int[]{1,4,2,3}, + new int[]{1,4,3,2}, + new int[]{2,1,3,4}, + new int[]{2,1,4,3}, + new 
int[]{2,3,1,4}, + new int[]{2,3,4,1}, + new int[]{2,4,1,3}, + new int[]{2,4,3,1}, + new int[]{3,1,2,4}, + new int[]{3,1,4,2}, + new int[]{3,2,1,4}, + new int[]{3,2,4,1}, + new int[]{3,4,1,2}, + new int[]{3,4,2,1}, + new int[]{4,1,2,3}, + new int[]{4,1,3,2}, + new int[]{4,2,1,3}, + new int[]{4,2,3,1}, + new int[]{4,3,1,2}, + new int[]{4,3,2,1} }; + for ( int gen = 0; gen < gens.length; gen ++ ) { + for ( int idx = 0; idx < 3; idx++ ) { + Assert.assertEquals(arr[idx],gens[gen][idx], + String.format("Error at generation %d, expected %s, observed %s",gen,Arrays.toString(gens[gen]),Arrays.toString(arr))); + } + arr = nextPermutation(arr); + } + } + + private double[] addEpsilon(double[] counts) { + double[] d = new double[counts.length]; + for ( int i = 0; i < counts.length; i ++ ) { + d[i] = counts[i] + 1e-3; + } + return d; + } + + @Test + public void testDirichletMultinomial() { + List testAlleles = Arrays.asList( + new double[]{80,240}, + new double[]{1,10000}, + new double[]{0,500}, + new double[]{5140,20480}, + new double[]{5000,800,200}, + new double[]{6,3,1000}, + new double[]{100,400,300,800}, + new double[]{8000,100,20,80,2}, + new double[]{90,20000,400,20,4,1280,720,1} + ); + + Assert.assertTrue(! Double.isInfinite(MathUtils.log10Gamma(1e-3)) && ! Double.isNaN(MathUtils.log10Gamma(1e-3))); + + int[] numAlleleSampled = new int[]{2,5,10,20,25}; + for ( double[] alleles : testAlleles ) { + for ( int count : numAlleleSampled ) { + // test that everything sums to one. Generate all multinomial draws + List likelihoods = new ArrayList<>(100000); + NextCounts generator = new NextCounts(alleles.length,count); + double maxLog = Double.MIN_VALUE; + //List countLog = new ArrayList(200); + while ( generator.hasNext() ) { + int[] thisCount = generator.next(); + //countLog.add(Arrays.toString(thisCount)); + Double likelihood = MathUtils.dirichletMultinomial(addEpsilon(alleles),thisCount); + Assert.assertTrue(! Double.isNaN(likelihood) && ! 
Double.isInfinite(likelihood), + String.format("Likelihood for counts %s and nAlleles %d was %s", + Arrays.toString(thisCount),alleles.length,Double.toString(likelihood))); + if ( likelihood > maxLog ) + maxLog = likelihood; + likelihoods.add(likelihood); + } + //System.out.printf("%d likelihoods and max is (probability) %e\n",likelihoods.size(),Math.pow(10,maxLog)); + Assert.assertEquals(MathUtils.sumLog10(unwrap(likelihoods)),1.0,1e-7, + String.format("Counts %d and alleles %d have nLikelihoods %d. \n Counts: %s", + count,alleles.length,likelihoods.size(), "NODEBUG"/*,countLog*/)); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/MedianUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MedianUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/MedianUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/MedianUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/NGSPlatformUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/PathUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/PathUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/PathUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java new file mode 100644 index 000000000..c8cbeeaf2 --- 
/dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java @@ -0,0 +1,189 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: 3/21/12 + */ + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Basic unit test for QualityUtils class + */ +public class QualityUtilsUnitTest extends BaseTest { + final private static double TOLERANCE = 1e-9; + + @BeforeClass + public void init() { + } + + @DataProvider(name = "QualTest") + public Object[][] makeMyDataProvider() { + final List tests = new ArrayList<>(); + + for ( int qual = 0; qual < 255; qual++ ) { + tests.add(new Object[]{(byte)(qual & 0xFF), Math.pow(10.0, ((double)qual)/-10.0)}); + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "QualTest") + public void testMyData(final byte qual, final double errorRate) { + final double trueRate = 1 - errorRate; + + final double actualErrorRate = QualityUtils.qualToErrorProb(qual); + Assert.assertEquals(actualErrorRate, errorRate, TOLERANCE); + final double actualTrueRate = QualityUtils.qualToProb(qual); + Assert.assertEquals(actualTrueRate, trueRate, TOLERANCE); + + // log10 tests + final double actualLog10ErrorRate = QualityUtils.qualToErrorProbLog10(qual); + Assert.assertEquals(actualLog10ErrorRate, Math.log10(errorRate), TOLERANCE); + final double actualLog10TrueRate = QualityUtils.qualToProbLog10(qual); + Assert.assertEquals(actualLog10TrueRate, Math.log10(trueRate), TOLERANCE); + + // test that we can convert our error rates to quals, accounting for boundaries + final int expectedQual = Math.max(Math.min(qual & 0xFF, QualityUtils.MAX_SAM_QUAL_SCORE), 1); + final byte actualQual = QualityUtils.trueProbToQual(trueRate); + Assert.assertEquals(actualQual, expectedQual & 0xFF); + final byte actualQualFromErrorRate = QualityUtils.errorProbToQual(errorRate); + 
Assert.assertEquals(actualQualFromErrorRate, expectedQual & 0xFF); + + for ( int maxQual = 10; maxQual < QualityUtils.MAX_SAM_QUAL_SCORE; maxQual++ ) { + final byte maxAsByte = (byte)(maxQual & 0xFF); + final byte expectedQual2 = (byte)(Math.max(Math.min(qual & 0xFF, maxQual), 1) & 0xFF); + final byte actualQual2 = QualityUtils.trueProbToQual(trueRate, maxAsByte); + Assert.assertEquals(actualQual2, expectedQual2, "Failed with max " + maxQual); + final byte actualQualFromErrorRate2 = QualityUtils.errorProbToQual(errorRate, maxAsByte); + Assert.assertEquals(actualQualFromErrorRate2, expectedQual2, "Failed with max " + maxQual); + + // test the integer routines + final byte actualQualInt2 = QualityUtils.trueProbToQual(trueRate, maxQual); + Assert.assertEquals(actualQualInt2, expectedQual2, "Failed with max " + maxQual); + final byte actualQualFromErrorRateInt2 = QualityUtils.errorProbToQual(errorRate, maxQual); + Assert.assertEquals(actualQualFromErrorRateInt2, expectedQual2, "Failed with max " + maxQual); + } + } + + @Test + public void testTrueProbWithMinDouble() { + final byte actual = QualityUtils.trueProbToQual(Double.MIN_VALUE); + Assert.assertEquals(actual, 1, "Failed to convert true prob of min double to 1 qual"); + } + + @Test + public void testTrueProbWithVerySmallValue() { + final byte actual = QualityUtils.trueProbToQual(1.7857786272673852E-19); + Assert.assertEquals(actual, 1, "Failed to convert true prob of very small value 1.7857786272673852E-19 to 1 qual"); + } + + @Test + public void testQualCaches() { + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 
30), -3.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6); + } + + @Test() + public void testBoundingDefault() { + for ( int qual = 0; qual < 1000; qual++ ) { + final byte expected = (byte)Math.max(Math.min(qual, QualityUtils.MAX_SAM_QUAL_SCORE), 1); + Assert.assertEquals(QualityUtils.boundQual(qual), expected); + } + } + + @Test() + public void testBoundingWithMax() { + for ( int max = 10; max < 255; max += 50 ) { + for ( int qual = 0; qual < 1000; qual++ ) { + final int expected = Math.max(Math.min(qual, max), 1); + Assert.assertEquals(QualityUtils.boundQual(qual, (byte)(max & 0xFF)) & 0xFF, expected & 0xFF, "qual " + qual + " max " + max); + } + } + } + + @DataProvider(name = "PhredScaleDoubleOps") + public Object[][] makePhredDoubleTest() { + final List tests = new ArrayList<>(); + + tests.add(new Object[]{0.0, -10 * Math.log10(Double.MIN_VALUE)}); + tests.add(new Object[]{1.0, 0.0}); + for ( int pow = 1; pow < 20; pow++ ) { + tests.add(new Object[]{Math.pow(10.0, -1.0 * pow), pow * 10}); + tests.add(new Object[]{Math.pow(10.0, -1.5 * pow), pow * 15}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test() + public void testQualToErrorProbDouble() { + for ( double qual = 3.0; qual < 255.0; qual += 0.1 ) { + final double expected = Math.pow(10.0, qual / -10.0); + Assert.assertEquals(QualityUtils.qualToErrorProb(qual), expected, TOLERANCE, "failed qual->error prob for double qual " + qual); + } + } + + + @Test(dataProvider = "PhredScaleDoubleOps") + public void testPhredScaleDoubleOps(final double errorRate, final double 
expectedPhredScaled) { + final double actualError = QualityUtils.phredScaleErrorRate(errorRate); + Assert.assertEquals(actualError, expectedPhredScaled, TOLERANCE); + final double trueRate = 1 - errorRate; + final double actualTrue = QualityUtils.phredScaleCorrectRate(trueRate); + if ( trueRate == 1.0 ) { + Assert.assertEquals(actualTrue, QualityUtils.MIN_PHRED_SCALED_QUAL); + } else { + final double tol = errorRate < 1e-10 ? 10.0 : 1e-3; + Assert.assertEquals(actualTrue, expectedPhredScaled, tol); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SequenceDictionaryUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java new file mode 100644 index 000000000..85b79a00f --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -0,0 +1,179 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.lang.reflect.Field; + +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class SimpleTimerUnitTest extends BaseTest { + private final static String NAME = "unit.test.timer"; + + @Test + public void testSimpleTimer() { + SimpleTimer t = new SimpleTimer(NAME); + Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); + Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); + Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); + + t.start(); + Assert.assertTrue(t.isRunning(), "Started timer isn't running"); + Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 = t.getElapsedTimeNano(); + double t1 = t.getElapsedTime(); + idleLoop(); // idle loop to wait a tiny bit of time + long n2 = t.getElapsedTimeNano(); + double t2 = t.getElapsedTime(); + Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); + + t.stop(); + Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); + double t3 = t.getElapsedTime(); + idleLoop(); // idle loop to wait a tiny bit of time + double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); + Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); + + t.restart(); + idleLoop(); // idle loop to wait a tiny bit of time + double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); + 
Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); + idleLoop(); // idle loop to wait a tiny bit of time + double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); + Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); + Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); + } + + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 
1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + } + + @Test + public void testMeaningfulTimes() { + SimpleTimer t = new SimpleTimer(NAME); + + t.start(); + for ( int i = 0; i < 100; i++ ) ; + long nano = t.getElapsedTimeNano(); + double secs = t.getElapsedTime(); + + Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); + Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); + + Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); + final long maxTimeInMicro = 10000; + final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); + Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); + } + + @Test + public void testCheckpointRestart() throws Exception { + SimpleTimer t = new SimpleTimer(NAME); + + final Field offsetField = t.getClass().getDeclaredField("nanoTimeOffset"); + offsetField.setAccessible(true); + long offset = ((Long) offsetField.get(t)).longValue(); + + t.start(); + idleLoop(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Time over restart is not zero."); + + t.start(); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + double elapsed = t.getElapsedTime(); + Assert.assertTrue(elapsed >= 0.0, "Elapsed time is zero."); + t.restart(); + // 
Make the clock jump again by just a little + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(1)); + idleLoop(); + t.stop(); + offset = ((Long) offsetField.get(t)).longValue(); + Assert.assertTrue(t.getElapsedTime() > elapsed, "Small clock drift causing reset."); + elapsed = t.getElapsedTime(); + // Now a bigger jump, into the future this time. + t.restart(); + // Make the clock jump again by a lot + offsetField.set(t, offset - TimeUnit.SECONDS.toNanos(10)); + t.stop(); + Assert.assertEquals(t.getElapsedTime(), elapsed, "Time added over checkpoint/restart."); + + // Test without stopping + t.start(); + offset = ((Long) offsetField.get(t)).longValue(); + // Make it as if clock has jumped into the past + offsetField.set(t, offset + TimeUnit.SECONDS.toNanos(10)); + Assert.assertEquals(t.getElapsedTime(), 0.0, "Elapsed time after C/R is not zero."); + idleLoop(); + Assert.assertTrue(t.getElapsedTime() > 0.0, "Elapsed time zero after re-sync."); + + } + + private static void idleLoop() { + for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/UtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/baq/BAQUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/baq/BAQUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java similarity index 
100% rename from public/java/test/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/classloader/JVMUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/DefaultHashMapUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/collections/ExpandingArrayListUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java new file mode 100644 index 000000000..7eca44ee6 --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -0,0 +1,326 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.io; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class IOUtilsUnitTest extends BaseTest { + @Test + public void testGoodTempDir() { + IOUtils.checkTempDir(new File("/tmp/queue")); + } + + @Test(expectedExceptions=UserException.BadTmpDir.class) + public void testBadTempDir() { + IOUtils.checkTempDir(new File("/tmp")); + } + + @Test + public void testAbsoluteSubDir() { + File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); + Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); + 
Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File(".")); + Assert.assertEquals(subDir, new File("/different/path")); + } + + @Test + public void testRelativeSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/path/path/to/file")); + } + + @Test + public void testDottedSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/../to/./file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + + subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + } + + @Test + public void testTempDir() { + File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); + Assert.assertTrue(tempDir.exists()); + Assert.assertFalse(tempDir.isFile()); + Assert.assertTrue(tempDir.isDirectory()); + boolean deleted = IOUtils.tryDelete(tempDir); + Assert.assertTrue(deleted); + Assert.assertFalse(tempDir.exists()); + } + + @Test + public void testDirLevel() { + File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); + Assert.assertEquals(dir, new File("/path")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); + Assert.assertEquals(dir, new File("/path/to")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); + 
Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); + Assert.assertEquals(dir, new File("/path/to/directory")); + } + + @Test + public void testAbsolute() { + File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); + Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.absolute(new File("/")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/././.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/./directory/.")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory/./")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory./")); + Assert.assertEquals(dir, new File("/directory.")); + + dir = IOUtils.absolute(new File("/./.directory/")); + Assert.assertEquals(dir, new File("/.directory")); + } + + @Test + public void testTail() throws IOException { + List lines = Arrays.asList( + "chr18_random 4262 3154410390 50 51", + "chr19_random 301858 3154414752 50 51", + "chr21_random 1679693 3154722662 50 51", + "chr22_random 257318 3156435963 50 51", + "chrX_random 1719168 3156698441 50 51"); + List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); + Assert.assertEquals(tail.size(), 5); + for (int i = 0; i < 5; i++) + Assert.assertEquals(tail.get(i), lines.get(i)); + } + + @Test + public void testWriteSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("testProperties.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteSystemTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("testProperties.properties", null)); + try { + 
Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("/testProperties.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("/testProperties.properties", IOUtils.class)); + try { + Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + // Looking for /org/broadinstitute/sting/utils/file/StingText.properties + IOUtils.writeResource(new Resource("StingText.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testResourceProperties() { + Resource resource = new Resource("foo", Resource.class); + Assert.assertEquals(resource.getPath(), "foo"); + Assert.assertEquals(resource.getRelativeClass(), Resource.class); + } + + @Test + public 
void testIsSpecialFile() { + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); + Assert.assertFalse(IOUtils.isSpecialFile(null)); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); + } + + @DataProvider( name = "ByteArrayIOTestData") + public Object[][] byteArrayIOTestDataProvider() { + return new Object[][] { + // file size, read buffer size + { 0, 4096 }, + { 1, 4096 }, + { 2000, 4096 }, + { 4095, 4096 }, + { 4096, 4096 }, + { 4097, 4096 }, + { 6000, 4096 }, + { 8191, 4096 }, + { 8192, 4096 }, + { 8193, 4096 }, + { 10000, 4096 } + }; + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToFile(dataWritten, tempFile); + byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); + byte[] dataRead = 
IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentFileIntoByteArray() { + File nonExistentFile = new File("djfhsdkjghdfk"); + Assert.assertFalse(nonExistentFile.exists()); + + IOUtils.readFileIntoByteArray(nonExistentFile); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadNullStreamIntoByteArray() { + IOUtils.readStreamIntoByteArray(null); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { + IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), + -1); + } + + @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) + public void testWriteByteArrayToUncreatableFile() { + IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteNullByteArrayToFile() { + IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteByteArrayToNullStream() { + IOUtils.writeByteArrayToStream(new byte[]{0}, null); + } + + private byte[] getDeterministicRandomData ( int size ) { + GenomeAnalysisEngine.resetRandomGenerator(); + Random rand = GenomeAnalysisEngine.getRandomGenerator(); + + byte[] randomData = new byte[size]; + rand.nextBytes(randomData); + + return randomData; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachineUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LIBS_position.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateBaseTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java similarity index 100% rename from 
public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java rename to 
public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/report/ReportMarshallerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java 
rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/runtime/RuntimeUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java 
similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/smithwaterman/SmithWatermanBenchmark.java diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java diff --git 
a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/threading/ThreadPoolMonitorUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java new file mode 100644 index 000000000..efc701a6d --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -0,0 +1,1741 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation 
+* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class GATKVariantContextUtilsUnitTest extends BaseTest { + private final static boolean DEBUG = false; + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + 
ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + GT = Allele.create("GT",false); + } + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); + } + + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); + } + + private VariantContext makeVC(String source, List alleles) { + return makeVC(source, alleles, null, null); + } + + private VariantContext makeVC(String source, List alleles, Genotype... g1) { + return makeVC(source, alleles, Arrays.asList(g1)); + } + + private VariantContext makeVC(String source, List alleles, String filter) { + return makeVC(source, alleles, filter.equals(".") ? null : new HashSet(Arrays.asList(filter))); + } + + private VariantContext makeVC(String source, List alleles, Set filters) { + return makeVC(source, alleles, null, filters); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes) { + return makeVC(source, alleles, genotypes, null); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { + int start = 10; + int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test allele merging + // + // -------------------------------------------------------------------------------- + + private class MergeAllelesTest extends TestDataProvider { + List> inputs; + List expected; + + private MergeAllelesTest(List... 
arg) { + super(MergeAllelesTest.class); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); + expected = all.pollLast(); + inputs = all; + } + + public String toString() { + return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); + } + } + @DataProvider(name = "mergeAlleles") + public Object[][] mergeAllelesData() { + // first, do no harm + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, T), + Arrays.asList(Aref, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C), + Arrays.asList(Aref, T), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearence + + new MergeAllelesTest(Arrays.asList(Aref, C, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearence + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATC)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + // alleles in the order we see them + new MergeAllelesTest(Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATCATC, ATC)); + + // same + new MergeAllelesTest(Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G), + Arrays.asList(Aref, ATCATC, G), + Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT)); + + return MergeAllelesTest.getTests(MergeAllelesTest.class); 
+ } + + @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") + public void testMergeAlleles(MergeAllelesTest cfg) { + final List inputs = new ArrayList(); + + int i = 0; + for ( final List alleles : cfg.inputs ) { + final String name = "vcf" + ++i; + inputs.add(makeVC(name, alleles)); + } + + final List priority = vcs2priority(inputs); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, priority, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + + Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size()); + Assert.assertEquals(merged.getAlleles(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test rsID merging + // + // -------------------------------------------------------------------------------- + + private class SimpleMergeRSIDTest extends TestDataProvider { + List inputs; + String expected; + + private SimpleMergeRSIDTest(String... 
arg) { + super(SimpleMergeRSIDTest.class); + LinkedList allStrings = new LinkedList(Arrays.asList(arg)); + expected = allStrings.pollLast(); + inputs = allStrings; + } + + public String toString() { + return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); + } + } + + @DataProvider(name = "simplemergersiddata") + public Object[][] createSimpleMergeRSIDData() { + new SimpleMergeRSIDTest(".", "."); + new SimpleMergeRSIDTest(".", ".", "."); + new SimpleMergeRSIDTest("rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); + new SimpleMergeRSIDTest(".", "rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates + new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); + + return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") + public void testRSIDMerge(SimpleMergeRSIDTest cfg) { + VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); + final List inputs = new ArrayList(); + + for ( final String id : cfg.inputs ) { + inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); + } + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, null, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + Assert.assertEquals(merged.getID(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test filtered merging + // + // -------------------------------------------------------------------------------- + + private class 
MergeFilteredTest extends TestDataProvider { + List inputs; + VariantContext expected; + String setExpected; + GATKVariantContextUtils.FilteredRecordMergeType type; + + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { + this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); + } + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { + super(MergeFilteredTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(input1, input2)); + this.expected = expected; + this.type = type; + inputs = all; + this.setExpected = setExpected; + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeFiltered") + public Object[][] mergeFilteredData() { + new MergeFilteredTest("AllPass", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("noFilters", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("oneFiltered", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("onePassOneFail", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("AllFiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.MERGE_FILTER_IN_ALL); + + // test ALL vs. ANY + new MergeFilteredTest("FailOneUnfiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("OneFailAllUnfilteredArg", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // test excluding allele in filtered record + new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // promotion of site from unfiltered to PASSES + new MergeFilteredTest("UnfilteredPlusPassIsPass", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("RefInAll", + makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_REF_IN_ALL); + + new MergeFilteredTest("RefInOne", + 
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + "2"); + + return MergeFilteredTest.getTests(MergeFilteredTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") + public void testMergeFiltered(MergeFilteredTest cfg) { + final List priority = vcs2priority(cfg.inputs); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test set field + Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); + + // test filter field + Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); + } + + // -------------------------------------------------------------------------------- + // + // Test genotype merging + // + // -------------------------------------------------------------------------------- + + private class MergeGenotypesTest extends TestDataProvider { + List inputs; + VariantContext expected; + List priority; + + private MergeGenotypesTest(String name, String priority, VariantContext... 
arg) { + super(MergeGenotypesTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(arg)); + this.expected = all.pollLast(); + inputs = all; + this.priority = Arrays.asList(priority.split(",")); + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeGenotypes") + public Object[][] mergeGenotypesData() { + new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); + + new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PreserveNoCall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PerserveAlleles", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", 
Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), + makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); + + // + // merging genothpes with PLs + // + + // first, do no harm + new MergeGenotypesTest("OrderedPLs", "1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles", "1", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", 
"2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", + makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); + + return MergeGenotypesTest.getTests(MergeGenotypesTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") + public void testMergeGenotypes(MergeGenotypesTest cfg) { + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test genotypes + assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); + } + + // necessary to not overload equals for genotypes + private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { + if (actual == expected) { + return; + } + + if (actual == null || 
expected == null) { + Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); + } + + if (actual.size() != expected.size()) { + Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); + } + + for (Genotype value : actual) { + Genotype expectedValue = expected.get(value.getSampleName()); + + Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); + Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); + Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); + if ( value.hasLikelihoods() ) + Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); + } + } + + @Test(enabled = !DEBUG) + public void testMergeGenotypesUniquify() { + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + + // test genotypes + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); + } + +// TODO: remove after testing +// @Test(expectedExceptions = IllegalStateException.class) +// public void testMergeGenotypesRequireUnique() { +// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); +// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); +// +// final VariantContext merged = VariantContextUtils.simpleMerge( +// Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); +// } + + // -------------------------------------------------------------------------------- + // + // Misc. tests + // + // -------------------------------------------------------------------------------- + + @Test(enabled = !DEBUG) + public void testAnnotationSet() { + for ( final boolean annotate : Arrays.asList(true, false)) { + for ( final String set : Arrays.asList("set", "combine", "x")) { + final List priority = Arrays.asList("1", "2"); + VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + + if ( annotate ) + Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); + else + Assert.assertFalse(merged.hasAttribute(set)); + } + } + } + + private static final List vcs2priority(final Collection vcs) { + final List priority = new ArrayList<>(); + + for ( final VariantContext vc : vcs ) { + priority.add(vc.getSource()); + } + + return priority; + } + + // -------------------------------------------------------------------------------- + // + // basic allele clipping test + // + // -------------------------------------------------------------------------------- + + private class ReverseClippingPositionTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { + super(ReverseClippingPositionTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "ReverseClippingPositionTestProvider") + public Object[][] makeReverseClippingPositionTestProvider() { + // pair clipping + new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); + new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); + new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); + } + + @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") + public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { + int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); + Assert.assertEquals(result, cfg.expectedClip); + } + + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); + final Allele twoCopies = Allele.create("GTTTTATTTTA", true); + final Allele zeroCopies = Allele.create("G", false); + final Allele oneCopies = Allele.create("GTTTTA", false); + tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), + Arrays.asList( + root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), + root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + assertVariantContextsAreEqual(actual, expected); + } + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext 
actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for ( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test repeats + // + // -------------------------------------------------------------------------------- + + private class RepeatDetectorTest extends TestDataProvider { + String ref; + boolean isTrueRepeat; + VariantContext vc; + + private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { + super(RepeatDetectorTest.class); + this.isTrueRepeat = isTrueRepeat; + this.ref = ref; + + List alleles = new LinkedList(); + final Allele refAllele = Allele.create(refAlleleString, true); + alleles.add(refAllele); + for ( final String altString: altAlleleStrings) { + final Allele alt = Allele.create(altString, false); + alleles.add(alt); + } + + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); + this.vc = builder.make(); + } + + public String toString() { + return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); + } + } + + @DataProvider(name = "RepeatDetectorTest") + public Object[][] makeRepeatDetectorTest() { + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); + new RepeatDetectorTest(false, "AAC", "A", "C"); + + // running out of ref bases => false + new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); + + // complex repeats + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); + + // multi-allelic + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false + + return RepeatDetectorTest.getTests(RepeatDetectorTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") + public void testRepeatDetectorTest(RepeatDetectorTest cfg) { + + // test alleles are equal + Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); + } + + @Test(enabled = !DEBUG) + public void testRepeatAllele() { + Allele nullR = Allele.create("A", true); + Allele nullA = Allele.create("A", false); + Allele atc = Allele.create("AATC", false); + Allele atcatc = Allele.create("AATCATC", false); + Allele ccccR = Allele.create("ACCCC", true); + Allele cc = Allele.create("ACC", false); + Allele cccccc = Allele.create("ACCCCCC", false); + Allele gagaR = Allele.create("AGAGA", true); + Allele gagagaga = Allele.create("AGAGAGAGA", false); + + // - / ATC [ref] from 20-22 + String delLoc = "chr1"; + int delLocStart = 20; + int delLocStop = 22; + + // - [ref] / ATC from 20-20 + String insLoc = "chr1"; + int insLocStart = 20; + int insLocStop = 20; + + Pair,byte[]> result; + 
byte[] refBytes = "TATCATCATCGGA".getBytes(); + + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); + + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); + + + // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 + VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,3); + + // ATC*,A,ATCATC + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],2); + 
Assert.assertEquals(result.getFirst().toArray()[2],4); + Assert.assertEquals(result.getSecond().length,3); + + // simple non-tandem deletion: CCCC*, - + refBytes = "TCCCCCCCCATG".getBytes(); + vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],8); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,1); + + // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 + refBytes = "TCCCCCCCAGAGAGAG".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],7); + Assert.assertEquals(result.getFirst().toArray()[1],5); + Assert.assertEquals(result.getFirst().toArray()[2],3); + Assert.assertEquals(result.getFirst().toArray()[3],9); + Assert.assertEquals(result.getSecond().length,1); + + // GAGA*,-,GAGAGAGA + refBytes = "TGAGAGAGAGATTT".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],5); + Assert.assertEquals(result.getFirst().toArray()[1],3); + Assert.assertEquals(result.getFirst().toArray()[2],7); + Assert.assertEquals(result.getSecond().length,2); + + } + + // -------------------------------------------------------------------------------- + // + // test forward clipping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ForwardClippingData") + public Object[][] makeForwardClippingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input 
data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("A"), -1}); + tests.add(new Object[]{Arrays.asList(""), -1}); + tests.add(new Object[]{Arrays.asList("A", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); + tests.add(new Object[]{Arrays.asList("A", "G"), -1}); + tests.add(new Object[]{Arrays.asList("A", "T"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); + tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); + tests.add(new Object[]{Arrays.asList("A", ""), -1}); + for ( int len = 0; len < 50; len++ ) + tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); + + tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") + public void testForwardClipping(final List alleleStrings, final int expectedClip) { + final List alleles = new LinkedList(); + for ( final String alleleString : alleleStrings ) + alleles.add(Allele.create(alleleString)); + + for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { + final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); + Assert.assertEquals(actual, expectedClip); + } + } + + @DataProvider(name = "ClipAlleleTest") + public Object[][] makeClipAlleleTest() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); + + // trims from left and right + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") + public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { + final int start = 10; + final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); + final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); + + Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); + for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { + final Allele trimmed = clipped.getAlleles().get(i); + Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); + } + } + + // -------------------------------------------------------------------------------- + // + // test primitive allele splitting + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "PrimitiveAlleleSplittingData") + public Object[][] makePrimitiveAlleleSplittingData() { + List tests = new ArrayList<>(); + + // no split + tests.add(new Object[]{"A", "C", 0, null}); + tests.add(new Object[]{"A", "AC", 0, null}); + tests.add(new Object[]{"AC", "A", 0, null}); + + // one split + tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); + tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); + tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); + + // two splits + tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); + tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); + tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); + + // three splits + tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") + public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { + + final int start = 10; + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + + final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); + + if ( expectedSplit > 0 ) { + Assert.assertEquals(result.size(), expectedSplit); + for ( int i = 0; i < variantPositions.size(); i++ ) { + Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); + } + } else { + Assert.assertEquals(result.size(), 1); + Assert.assertEquals(vc, 
result.get(0)); + } + } + + // -------------------------------------------------------------------------------- + // + // test allele remapping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "AlleleRemappingData") + public Object[][] makeAlleleRemappingData() { + List tests = new ArrayList<>(); + + final Allele originalBase1 = Allele.create((byte)'A'); + final Allele originalBase2 = Allele.create((byte)'T'); + + for ( final byte base1 : BaseUtils.BASES ) { + for ( final byte base2 : BaseUtils.BASES ) { + for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { + Map map = new HashMap<>(2); + map.put(originalBase1, Allele.create(base1)); + map.put(originalBase2, Allele.create(base2)); + + tests.add(new Object[]{map, numGenotypes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") + public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { + + final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); + + final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); + + final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); + + for ( int i = 0; i < numGenotypes; i++ ) { + final Genotype originalG = originalGC.get(String.format("%d", i)); + final Genotype remappedG = remappedGC.get(String.format("%d", i)); + + Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); + for ( int j = 0; j < originalG.getAlleles().size(); j++ ) + Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); + } + } + + private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { + GenomeAnalysisEngine.resetRandomGenerator(); + final Random random = 
GenomeAnalysisEngine.getRandomGenerator(); + + final GenotypesContext gc = GenotypesContext.create(); + for ( int i = 0; i < numGenotypes; i++ ) { + // choose alleles at random + final List myAlleles = new ArrayList(); + myAlleles.add(alleles.get(random.nextInt(2))); + myAlleles.add(alleles.get(random.nextInt(2))); + + final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); + gc.add(g); + } + + return gc; + } + + // -------------------------------------------------------------------------------- + // + // Test subsetDiploidAlleles + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "subsetDiploidAllelesData") + public Object[][] makesubsetDiploidAllelesData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); + + // make sure we don't screw up the simple case + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new 
GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test case + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).PL(uninformative).GQ(0).make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homRef3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).GQ(100).make())}); + + 
tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).GQ(200).make())}); + + // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetCG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).GQ(200).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).GQ(200).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "subsetDiploidAllelesData") + public void testsubsetDiploidAllelesData(final VariantContext inputVC, + final List allelesToUse, + final List expectedGenotypes) { + final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + @DataProvider(name = "UpdateGenotypeAfterSubsettingData") + public Object[][] makeUpdateGenotypeAfterSubsettingData() { + List 
tests = new ArrayList(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); + + final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; + final double[] hetPL = new double[]{0.09, 0.9, 0.01}; + final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; + final double[] uninformative = new double[]{0.33, 0.33, 0.33}; + final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); + + for ( final List alleles : allSubsetAlleles ) { + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + } + + for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); +// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); + tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") + public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, + final double[] likelihoods, + final List originalGT, + final List allelesToUse, + final List expectedAlleles) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); + GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, 
allelesToUse); + final Genotype g = gb.make(); + Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); + } + + @Test(enabled = !DEBUG) + public void testSubsetToRef() { + final Map tests = new LinkedHashMap<>(); + + for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { + for ( final String name : Arrays.asList("test1", "test2") ) { + final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); + builder.DP(10); + builder.GQ(30); + builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); + builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); + final List refs = Collections.nCopies(alleles.size(), Aref); + tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); + } + } + + for ( final int n : Arrays.asList(1, 2, 3) ) { + for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { + final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); + final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); + + Assert.assertEquals(gc.size(), genotypes.size()); + for ( int i = 0; i < genotypes.size(); i++ ) { +// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); + assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); + } + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test updatePLsAndAD + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "updatePLsAndADData") + public Object[][] makeUpdatePLsAndADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final 
Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); + + // make sure we don't screw up the simple case where no selection happens + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test cases + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); + tests.add(new Object[]{new 
VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + + final int[] homRef3AllelesAD = new int[]{20, 0, 1}; + final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; + final int[] homC3AllelesAD = new int[]{0, 20, 1}; + final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; + final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG + final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), + new 
VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "updatePLsAndADData") + public void testUpdatePLsAndADData(final VariantContext originalVC, + final VariantContext selectedVC, + final List expectedGenotypes) { + final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); + final 
GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + // -------------------------------------------------------------------------------- + // + // Test methods for merging reference confidence VCs + // + // -------------------------------------------------------------------------------- + + + @Test(dataProvider = "indexOfAlleleData") + public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { + final List alleles = new ArrayList<>(altAlleles.size() + 1); + alleles.add(reference); + alleles.addAll(altAlleles); + final VariantContext vc = makeVC("Source", alleles); + + for (int i = 0; i < alleles.size(); i++) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,false),-1); + if (i == 0) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,false),-1); + 
Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),true),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),false),false,true,true),-1); + } else { + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),true),i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),false), i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),true),i-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),false),-1); + } + } + + for (final Allele other : otherAlleles) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, true, true, true), -1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, false, false, false),-1); + } + } + + @DataProvider(name = "indexOfAlleleData") + public Iterator indexOfAlleleData() { + + final Allele[] ALTERNATIVE_ALLELES = new Allele[] { T, C, G, ATC, ATCATC}; + + final int lastMask = 0x1F; + + return new Iterator() { + + int nextMask = 0; + + @Override + public boolean hasNext() { + return nextMask <= lastMask; + } + + @Override + public Object[] next() { + + int mask = nextMask++; + final List 
includedAlleles = new ArrayList<>(5); + final List excludedAlleles = new ArrayList<>(5); + for (int i = 0; i < ALTERNATIVE_ALLELES.length; i++) { + ((mask & 1) == 1 ? includedAlleles : excludedAlleles).add(ALTERNATIVE_ALLELES[i]); + mask >>= 1; + } + return new Object[] { Aref , includedAlleles, excludedAlleles}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + @Test(dataProvider = "generatePLsData") + public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { + + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); + final int[] PLs = new int[numLikelihoods]; + for ( int i = 0; i < numLikelihoods; i++ ) + PLs[i] = i; + + final List alleles = new ArrayList<>(numOriginalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < numOriginalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); + + final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); + + Assert.assertEquals(newPLs.length, numLikelihoods); + + final int[] expectedPLs = new int[numLikelihoods]; + for ( int i = 0; i < numOriginalAlleles; i++ ) { + for ( int j = i; j < numOriginalAlleles; j++ ) { + final int index = GenotypeLikelihoods.calculatePLindex(i, j); + final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); + expectedPLs[index] = value; + } + } + + for ( int i = 0; i < numLikelihoods; i++ ) { + Assert.assertEquals(newPLs[i], expectedPLs[i]); + } + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = 
GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } + + @Test + public void testGenerateADWithNewAlleles() { + + final int[] originalAD = new int[] {1,2,0}; + final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; + + final int[] newAD = GATKVariantContextUtils.generateAD(originalAD, indexesOfRelevantAlleles); + Assert.assertEquals(newAD, new int[]{1,2,0,0}); + } + + + @Test(expectedExceptions = UserException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] 
indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List noCalls = new ArrayList<>(2); + noCalls.add(Allele.NO_CALL); + noCalls.add(Allele.NO_CALL); + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); + final VariantContext vcA_ALT = new 
VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_ATC_ALT = new 
VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new 
VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); + + // combination of all + tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), + + loc, false, + null}); + tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + + final Object[][] result = tests.toArray(new Object[][]{}); + return result; + } + + @DataProvider(name = "generatePLsData") + public Object[][] makeGeneratePLsData() { + final List tests = new ArrayList<>(); + + for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { + for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { + for ( int 
swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { + final int[] indexes = new int[originalAlleles]; + for ( int i = 0; i < originalAlleles; i++ ) + indexes[i] = i; + indexes[swapPosition1] = swapPosition2; + indexes[swapPosition2] = swapPosition1; + tests.add(new Object[]{originalAlleles, indexes}); + } + } + } + return tests.toArray(new Object[][]{}); + } +} + diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java rename to public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java diff --git a/public/testdata/exampleBAM.bam b/public/gatk-framework/src/test/resources/exampleBAM.bam similarity index 100% rename from public/testdata/exampleBAM.bam rename to public/gatk-framework/src/test/resources/exampleBAM.bam diff --git a/public/testdata/exampleBAM.bam.bai b/public/gatk-framework/src/test/resources/exampleBAM.bam.bai similarity index 100% rename from public/testdata/exampleBAM.bam.bai rename to public/gatk-framework/src/test/resources/exampleBAM.bam.bai diff --git a/public/testdata/exampleBAM.simple.bai b/public/gatk-framework/src/test/resources/exampleBAM.simple.bai similarity index 100% rename from public/testdata/exampleBAM.simple.bai rename to public/gatk-framework/src/test/resources/exampleBAM.simple.bai 
diff --git a/public/testdata/exampleBAM.simple.bam b/public/gatk-framework/src/test/resources/exampleBAM.simple.bam similarity index 100% rename from public/testdata/exampleBAM.simple.bam rename to public/gatk-framework/src/test/resources/exampleBAM.simple.bam diff --git a/public/testdata/exampleDBSNP.vcf b/public/gatk-framework/src/test/resources/exampleDBSNP.vcf similarity index 100% rename from public/testdata/exampleDBSNP.vcf rename to public/gatk-framework/src/test/resources/exampleDBSNP.vcf diff --git a/public/testdata/exampleDBSNP.vcf.idx b/public/gatk-framework/src/test/resources/exampleDBSNP.vcf.idx similarity index 100% rename from public/testdata/exampleDBSNP.vcf.idx rename to public/gatk-framework/src/test/resources/exampleDBSNP.vcf.idx diff --git a/public/testdata/exampleFASTA-3contigs.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-3contigs.fasta similarity index 100% rename from public/testdata/exampleFASTA-3contigs.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-3contigs.fasta diff --git a/public/testdata/exampleFASTA-combined.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-combined.fasta similarity index 100% rename from public/testdata/exampleFASTA-combined.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-combined.fasta diff --git a/public/testdata/exampleFASTA-windows.fasta b/public/gatk-framework/src/test/resources/exampleFASTA-windows.fasta similarity index 100% rename from public/testdata/exampleFASTA-windows.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA-windows.fasta diff --git a/public/testdata/exampleFASTA.dict b/public/gatk-framework/src/test/resources/exampleFASTA.dict similarity index 100% rename from public/testdata/exampleFASTA.dict rename to public/gatk-framework/src/test/resources/exampleFASTA.dict diff --git a/public/testdata/exampleFASTA.fasta b/public/gatk-framework/src/test/resources/exampleFASTA.fasta similarity index 100% rename from 
public/testdata/exampleFASTA.fasta rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta diff --git a/public/testdata/exampleFASTA.fasta.amb b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.amb similarity index 100% rename from public/testdata/exampleFASTA.fasta.amb rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.amb diff --git a/public/testdata/exampleFASTA.fasta.ann b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.ann similarity index 100% rename from public/testdata/exampleFASTA.fasta.ann rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.ann diff --git a/public/testdata/exampleFASTA.fasta.bwt b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.bwt similarity index 100% rename from public/testdata/exampleFASTA.fasta.bwt rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.bwt diff --git a/public/testdata/exampleFASTA.fasta.fai b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.fai similarity index 100% rename from public/testdata/exampleFASTA.fasta.fai rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.fai diff --git a/public/testdata/exampleFASTA.fasta.pac b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.pac similarity index 100% rename from public/testdata/exampleFASTA.fasta.pac rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.pac diff --git a/public/testdata/exampleFASTA.fasta.rbwt b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rbwt similarity index 100% rename from public/testdata/exampleFASTA.fasta.rbwt rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rbwt diff --git a/public/testdata/exampleFASTA.fasta.rpac b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rpac similarity index 100% rename from public/testdata/exampleFASTA.fasta.rpac rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rpac diff --git 
a/public/testdata/exampleFASTA.fasta.rsa b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.rsa similarity index 100% rename from public/testdata/exampleFASTA.fasta.rsa rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.rsa diff --git a/public/testdata/exampleFASTA.fasta.sa b/public/gatk-framework/src/test/resources/exampleFASTA.fasta.sa similarity index 100% rename from public/testdata/exampleFASTA.fasta.sa rename to public/gatk-framework/src/test/resources/exampleFASTA.fasta.sa diff --git a/public/testdata/exampleGATKReport.eval b/public/gatk-framework/src/test/resources/exampleGATKReport.eval similarity index 100% rename from public/testdata/exampleGATKReport.eval rename to public/gatk-framework/src/test/resources/exampleGATKReport.eval diff --git a/public/testdata/exampleGATKReportv1.tbl b/public/gatk-framework/src/test/resources/exampleGATKReportv1.tbl similarity index 100% rename from public/testdata/exampleGATKReportv1.tbl rename to public/gatk-framework/src/test/resources/exampleGATKReportv1.tbl diff --git a/public/testdata/exampleGATKReportv2.tbl b/public/gatk-framework/src/test/resources/exampleGATKReportv2.tbl similarity index 100% rename from public/testdata/exampleGATKReportv2.tbl rename to public/gatk-framework/src/test/resources/exampleGATKReportv2.tbl diff --git a/public/testdata/exampleGRP.grp b/public/gatk-framework/src/test/resources/exampleGRP.grp similarity index 100% rename from public/testdata/exampleGRP.grp rename to public/gatk-framework/src/test/resources/exampleGRP.grp diff --git a/public/testdata/exampleINTERVAL.intervals b/public/gatk-framework/src/test/resources/exampleINTERVAL.intervals similarity index 100% rename from public/testdata/exampleINTERVAL.intervals rename to public/gatk-framework/src/test/resources/exampleINTERVAL.intervals diff --git a/public/testdata/exampleNORG.bam b/public/gatk-framework/src/test/resources/exampleNORG.bam similarity index 100% rename from public/testdata/exampleNORG.bam 
rename to public/gatk-framework/src/test/resources/exampleNORG.bam diff --git a/public/testdata/exampleNORG.bam.bai b/public/gatk-framework/src/test/resources/exampleNORG.bam.bai similarity index 100% rename from public/testdata/exampleNORG.bam.bai rename to public/gatk-framework/src/test/resources/exampleNORG.bam.bai diff --git a/public/gatk-framework/src/test/resources/forSimulation.vcf b/public/gatk-framework/src/test/resources/forSimulation.vcf new file mode 100644 index 000000000..a0c57c2c0 --- /dev/null +++ b/public/gatk-framework/src/test/resources/forSimulation.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +20 10000000 . T C . . . GT 0/1 0/0 1/1 +20 10001000 . GG AA . . . GT 0/1 0/0 1/1 +20 10002000 . TAGTA T . . . GT 0/1 0/0 1/1 +20 10003000 . A AGCT . . . GT 0/1 0/0 1/1 +20 10004000 . GAT G,GATAT . . . GT 0/1 0/0 1/1 diff --git a/public/gatk-framework/src/test/resources/forSimulation.vcf.idx b/public/gatk-framework/src/test/resources/forSimulation.vcf.idx new file mode 100644 index 000000000..4f734b7af Binary files /dev/null and b/public/gatk-framework/src/test/resources/forSimulation.vcf.idx differ diff --git a/public/gatk-framework/src/test/resources/testProperties.properties b/public/gatk-framework/src/test/resources/testProperties.properties new file mode 100644 index 000000000..e422d6eb1 --- /dev/null +++ b/public/gatk-framework/src/test/resources/testProperties.properties @@ -0,0 +1,2 @@ +foo=bar +version=1.0 diff --git a/public/testdata/testfile.sam b/public/gatk-framework/src/test/resources/testfile.sam similarity index 100% rename from public/testdata/testfile.sam rename to public/gatk-framework/src/test/resources/testfile.sam diff --git a/public/gatk-package/pom.xml b/public/gatk-package/pom.xml new file mode 100644 index 000000000..9272cf983 --- /dev/null +++ b/public/gatk-package/pom.xml @@ -0,0 +1,286 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator 
+ 2.8-SNAPSHOT + ../.. + + + gatk-package + jar + GATK Package + + + ${project.basedir}/../.. + prepare-package + package + org.broadinstitute.sting.gatk.CommandLineGATK + GenomeAnalysisTK + + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + org.broad + tribble + + + + org.broadinstitute + variant + + + + commons-logging + commons-logging + + + + ${project.groupId} + gatk-framework + ${project.version} + example-resources + tar.bz2 + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + unit-tests + + ${sting.serialunittests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + integration-tests + + ${sting.serialintegrationtests.skipped} + + org.broadinstitute.sting:.* + + + + + pipeline-tests + + ${sting.serialpipelinetests.skipped} + + org.broadinstitute.sting:.* + + + + + large-scale-tests + + ${sting.seriallargescaletests.skipped} + + org.broadinstitute.sting:.* + + + + + knowledge-base-tests + + ${sting.serialknowledgebasetests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + ${sting.unpack.phase} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + binary-dist + ${sting.shade.phase} + + + + + + com.pyx4j + maven-junction-plugin + + + link-binary-jar + ${sting.shade.phase} + + + link-git-release + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-install-plugin + + + default-install + none + + + install-package + install + + + + + + + + + + protected + + + ${basedir}/../../protected/gatk-protected/pom.xml + + + + + ${project.groupId} + gatk-protected + ${project.version} + true + + + 
${project.groupId} + gatk-protected + ${project.version} + test-jar + test + true + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + ${project.groupId} + gatk-private + ${project.version} + true + + + ${project.groupId} + gatk-private + ${project.version} + test-jar + test + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + none + + + + + diff --git a/public/gatk-package/src/main/assembly/binary-dist.xml b/public/gatk-package/src/main/assembly/binary-dist.xml new file mode 100644 index 000000000..adc52646c --- /dev/null +++ b/public/gatk-package/src/main/assembly/binary-dist.xml @@ -0,0 +1,22 @@ + + binary-dist + + tar.bz2 + + false + + + + org.broadinstitute.sting:gatk-package + + ${sting.binary-dist.name}.${artifact.extension} + + + resources + true + + org.broadinstitute.sting:gatk-framework:tar.bz2:example-resources + + + + diff --git a/public/gatk-queue-extgen/pom.xml b/public/gatk-queue-extgen/pom.xml new file mode 100644 index 000000000..532515ff8 --- /dev/null +++ b/public/gatk-queue-extgen/pom.xml @@ -0,0 +1,29 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + gatk-queue-extgen + jar + Queue GATK ExtGen + Queue GATK Extensions Generator + + + ${project.basedir}/../.. 
+ + + + + ${project.groupId} + gatk-framework + ${project.version} + + + + diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ArgumentField.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java b/public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java rename to public/gatk-queue-extgen/src/main/java/org/broadinstitute/sting/queue/extensions/gatk/ReadFilterField.java diff --git a/public/gsalib/pom.xml b/public/gsalib/pom.xml new file mode 100644 index 
000000000..66845f584 --- /dev/null +++ b/public/gsalib/pom.xml @@ -0,0 +1,45 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + gsalib + pom + Sting GSALib + + + ${project.basedir}/../.. + org/broadinstitute/sting/utils/R + gsalib.tar.gz + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + gsalib-assembly + + single + + ${sting.generate-resources.phase} + + false + + src/assembly/gsalib.xml + + + + + + + + diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION b/public/gsalib/src/R/DESCRIPTION similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION rename to public/gsalib/src/R/DESCRIPTION diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/NAMESPACE b/public/gsalib/src/R/NAMESPACE similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/NAMESPACE rename to public/gsalib/src/R/NAMESPACE diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R b/public/gsalib/src/R/R/gsa.error.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R rename to public/gsalib/src/R/R/gsa.error.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R b/public/gsalib/src/R/R/gsa.getargs.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R rename to public/gsalib/src/R/R/gsa.getargs.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R b/public/gsalib/src/R/R/gsa.message.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R rename to public/gsalib/src/R/R/gsa.message.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R b/public/gsalib/src/R/R/gsa.plot.venn.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R rename 
to public/gsalib/src/R/R/gsa.plot.venn.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R b/public/gsalib/src/R/R/gsa.read.eval.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R rename to public/gsalib/src/R/R/gsa.read.eval.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/gsalib/src/R/R/gsa.read.gatkreport.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R rename to public/gsalib/src/R/R/gsa.read.gatkreport.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R b/public/gsalib/src/R/R/gsa.read.squidmetrics.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R rename to public/gsalib/src/R/R/gsa.read.squidmetrics.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R b/public/gsalib/src/R/R/gsa.read.vcf.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R rename to public/gsalib/src/R/R/gsa.read.vcf.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/gsalib/src/R/R/gsa.variantqc.utils.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R rename to public/gsalib/src/R/R/gsa.variantqc.utils.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R b/public/gsalib/src/R/R/gsa.warn.R similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R rename to public/gsalib/src/R/R/gsa.warn.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me b/public/gsalib/src/R/Read-and-delete-me similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me rename 
to public/gsalib/src/R/Read-and-delete-me diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd b/public/gsalib/src/R/man/gsa.error.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd rename to public/gsalib/src/R/man/gsa.error.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd b/public/gsalib/src/R/man/gsa.getargs.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd rename to public/gsalib/src/R/man/gsa.getargs.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd b/public/gsalib/src/R/man/gsa.message.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd rename to public/gsalib/src/R/man/gsa.message.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd b/public/gsalib/src/R/man/gsa.plot.venn.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd rename to public/gsalib/src/R/man/gsa.plot.venn.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd b/public/gsalib/src/R/man/gsa.read.eval.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd rename to public/gsalib/src/R/man/gsa.read.eval.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd b/public/gsalib/src/R/man/gsa.read.gatkreport.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd rename to public/gsalib/src/R/man/gsa.read.gatkreport.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd b/public/gsalib/src/R/man/gsa.read.squidmetrics.Rd similarity index 100% rename from 
public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd rename to public/gsalib/src/R/man/gsa.read.squidmetrics.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd b/public/gsalib/src/R/man/gsa.read.vcf.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd rename to public/gsalib/src/R/man/gsa.read.vcf.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd b/public/gsalib/src/R/man/gsa.warn.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd rename to public/gsalib/src/R/man/gsa.warn.Rd diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd b/public/gsalib/src/R/man/gsalib-package.Rd similarity index 100% rename from public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd rename to public/gsalib/src/R/man/gsalib-package.Rd diff --git a/public/gsalib/src/assembly/gsalib.xml b/public/gsalib/src/assembly/gsalib.xml new file mode 100644 index 000000000..7650c713d --- /dev/null +++ b/public/gsalib/src/assembly/gsalib.xml @@ -0,0 +1,13 @@ + + gsalib + + tar.gz + + false + + + gsalib + src/R + + + diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java deleted file mode 100644 index a70d6e706..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ /dev/null @@ -1,840 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the 
Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.Multiplex; -import org.broadinstitute.sting.gatk.walkers.Multiplexer; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * An descriptor capable of providing parsers that can parse any type - * of supported command-line argument. 
- * - * @author mhanna - * @version 0.1 - */ -public abstract class ArgumentTypeDescriptor { - private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; - - /** - * our log, which we want to capture anything from org.broadinstitute.sting - */ - protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); - - /** - * Fetch the given descriptor from the descriptor repository. - * @param descriptors the descriptors from which to select a good match. - * @param type Class for which to specify a descriptor. - * @return descriptor for the given type. - */ - public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { - for( ArgumentTypeDescriptor descriptor: descriptors ) { - if( descriptor.supports(type) ) - return descriptor; - } - throw new ReviewedStingException("Can't process command-line arguments of type: " + type.getName()); - } - - /** - * Does this descriptor support classes of the given type? - * @param type The type to check. - * @return true if this descriptor supports the given type, false otherwise. - */ - public abstract boolean supports( Class type ); - - /** - * Returns false if a type-specific default can be employed. - * @param source Source of the command-line argument. - * @return True to throw in a type specific default. False otherwise. - */ - public boolean createsTypeDefault(ArgumentSource source) { return false; } - - /** - * Returns a documentation-friendly value for the default of a type descriptor. - * Must be overridden if createsTypeDefault return true. cannot be called otherwise - * @param source Source of the command-line argument. - * @return Friendly string of the default value, for documentation. If doesn't create a default, throws - * and UnsupportedOperationException - */ - public String typeDefaultDocString(ArgumentSource source) { - throw new UnsupportedOperationException(); - } - - /** - * Generates a default for the given type. 
- * - * @param parsingEngine the parsing engine used to validate this argument type descriptor. - * @param source Source of the command-line argument. - * @param type Type of value to create, in case the command-line argument system wants influence. - * @return A default value for the given type. - */ - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } - - /** - * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. - * @param source Source class and field for the given argument. - * @return A list of command-line argument definitions supporting this field. - */ - public List createArgumentDefinitions( ArgumentSource source ) { - return Collections.singletonList(createDefaultArgumentDefinition(source)); - } - - /** - * Parses an argument source to an object. - * WARNING! Mandatory side effect of parsing! Each parse routine should register the tags it finds with the proper CommandLineProgram. - * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. - * - * @param parsingEngine The engine responsible for parsing. - * @param source The source used to find the matches. - * @param matches The matches for the source. - * @return The parsed object. - */ - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { - return parse(parsingEngine, source, source.field.getGenericType(), matches); - } - - /** - * Returns true if the field is a collection or an array. - * @param source The argument source to check. - * @return true if the field is a collection or an array. 
- */ - public boolean isMultiValued( ArgumentSource source ) { - Class argumentType = source.field.getType(); - return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); - } - - /** - * By default, argument sources create argument definitions with a set of default values. - * Use this method to create the one simple argument definition. - * @param source argument source for which to create a default definition. - * @return The default definition for this argument source. - */ - protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { - Annotation argumentAnnotation = getArgumentAnnotation(source); - return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), - source.field.getType(), - ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), - ArgumentDefinition.getShortName(argumentAnnotation), - ArgumentDefinition.getDoc(argumentAnnotation), - source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), - source.isFlag(), - source.isMultiValued(), - source.isHidden(), - makeRawTypeIfNecessary(getCollectionComponentType(source.field)), - ArgumentDefinition.getExclusiveOf(argumentAnnotation), - ArgumentDefinition.getValidationRegex(argumentAnnotation), - getValidOptions(source) ); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - protected Type getCollectionComponentType( Field field ) { - return null; - } - - /** - * Parses the argument matches for a class type into an object. - * @param source The original argument source used to find the matches. - * @param type The current class type being inspected. 
May not match the argument source.field.getType() if this as a collection for example. - * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. - * @return The individual parsed object matching the argument match with Class type. - */ - public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); - - /** - * If the argument source only accepts a small set of options, populate the returned list with - * those options. Otherwise, leave the list empty. - * @param source Original field specifying command-line arguments. - * @return A list of valid options. - */ - protected List getValidOptions( ArgumentSource source ) { - if(!source.field.getType().isEnum()) - return null; - List validOptions = new ArrayList(); - for(Object constant: source.field.getType().getEnumConstants()) - validOptions.add(constant.toString()); - return validOptions; - } - - /** - * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return true if the argument is present, or false if not present. - */ - protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - return true; - } - return false; - } - - /** - * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. - * If the argument matches multiple values, an exception will be thrown. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. 
- */ - protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection argumentValues = getArgumentValues( definition, matches ); - if( argumentValues.size() > 1 ) - throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); - return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; - } - - /** - * Gets the tags associated with a given command-line argument. - * If the argument matches multiple values, an exception will be thrown. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. - */ - protected Tags getArgumentTags(ArgumentMatches matches) { - Tags tags = new Tags(); - for(ArgumentMatch match: matches) { - if(!tags.isEmpty() && !match.tags.isEmpty()) - throw new ReviewedStingException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict."); - tags = match.tags; - } - return tags; - } - - /** - * Gets the values of an argument with the given full name, from the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or an empty collection if not present. - */ - protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - values.addAll(match.values()); - } - return values; - } - - /** - * Retrieves the argument description from the given argument source. Will throw an exception if - * the given ArgumentSource - * @param source source of the argument. - * @return Argument description annotation associated with the given field. 
- */ - @SuppressWarnings("unchecked") - protected static Annotation getArgumentAnnotation( ArgumentSource source ) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (source.field.isAnnotationPresent(annotation)) - return source.field.getAnnotation(annotation); - throw new ReviewedStingException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); - } - - /** - * Returns true if an argument annotation is present - * @param field The field to check for an annotation. - * @return True if an argument annotation is present on the field. - */ - @SuppressWarnings("unchecked") - public static boolean isArgumentAnnotationPresent(Field field) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (field.isAnnotationPresent(annotation)) - return true; - return false; - } - - /** - * Returns true if the given annotation is hidden from the help system. - * @param field Field to test. - * @return True if argument should be hidden. False otherwise. - */ - public static boolean isArgumentHidden(Field field) { - return field.isAnnotationPresent(Hidden.class); - } - - public static Class makeRawTypeIfNecessary(Type t) { - if ( t == null ) - return null; - else if ( t instanceof ParameterizedType ) - return (Class)((ParameterizedType) t).getRawType(); - else if ( t instanceof Class ) { - return (Class)t; - } else { - throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); - } - } - - /** - * The actual argument parsing method. - * @param source source - * @param type type to check - * @param matches matches - * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
- */ - protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - @SuppressWarnings("unchecked") - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - String name = defaultDefinition.fullName; - - return parseBinding(value, parameterType, type, name, tags, source.field.getName()); - } - - /** - * - * @param value The source of the binding - * @param parameterType The Tribble Feature parameter type - * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. - * @param bindingName The name of the binding passed to the constructor. - * @param tags Tags for the binding used for parsing and passed to the constructor. - * @param fieldName The name of the field that was parsed. Used for error reporting. - * @return The newly created binding object of type bindingClass. - */ - public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, - String bindingName, Tags tags, String fieldName) { - try { - String tribbleType = null; - // must have one or two tag values here - if ( tags.getPositionalTags().size() > 2 ) { - throw new UserException.CommandLineException( - String.format("Unexpected number of positional tags for argument %s : %s. 
" + - "Rod bindings only support -X:type and -X:name,type argument styles", - value.asString(), fieldName)); - } else if ( tags.getPositionalTags().size() == 2 ) { - // -X:name,type style - bindingName = tags.getPositionalTags().get(0); - tribbleType = tags.getPositionalTags().get(1); - - FeatureManager manager = new FeatureManager(); - if ( manager.getByName(tribbleType) == null ) - throw new UserException.UnknownTribbleType( - tribbleType, - String.format("Unable to find tribble type '%s' provided on the command line. " + - "Please select a correct type from among the supported types:%n%s", - tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); - - } else { - // case with 0 or 1 positional tags - FeatureManager manager = new FeatureManager(); - - // -X:type style is a type when we cannot determine the type dynamically - String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; - if ( tag1 != null ) { - if ( manager.getByName(tag1) != null ) // this a type - tribbleType = tag1; - else - bindingName = tag1; - } - - if ( tribbleType == null ) { - // try to determine the file type dynamically - File file = value.asFile(); - if ( file.canRead() && file.isFile() ) { - FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); - if ( featureDescriptor != null ) { - tribbleType = featureDescriptor.getName(); - logger.info("Dynamically determined type of " + file + " to be " + tribbleType); - } - } - - if ( tribbleType == null ) { - // IntervalBinding can be created from a normal String - Class rawType = (makeRawTypeIfNecessary(bindingClass)); - try { - return rawType.getConstructor(String.class).newInstance(value.asString()); - } catch (NoSuchMethodException e) { - /* ignore */ - } - - if ( ! file.exists() ) { - throw new UserException.CouldNotReadInputFile(file, "file does not exist"); - } else if ( ! file.canRead() || ! 
file.isFile() ) { - throw new UserException.CouldNotReadInputFile(file, "file could not be read"); - } else { - throw new UserException.CommandLineException( - String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + - "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", - manager.userFriendlyListOfAvailableFeatures(parameterType))); - } - } - } - } - - Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); - } catch (Exception e) { - if ( e instanceof UserException ) - throw ((UserException)e); - else - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. Message: %s", - value, fieldName, e.getMessage())); - } - } -} - -/** - * Parser for RodBinding objects - */ -class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want RodBinding class objects - * @param type The type to check. - * @return true if the provided class is a RodBinding.class - */ - @Override - public boolean supports( Class type ) { - return isRodBinding(type); - } - - public static boolean isRodBinding( Class type ) { - return RodBinding.class.isAssignableFrom(type); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } - - @Override - @SuppressWarnings("unchecked") - public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - return RodBinding.makeUnbound((Class)parameterType); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "none"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - Tags tags = getArgumentTags(matches); - RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); - parsingEngine.addTags(rbind, tags); - parsingEngine.addRodBinding(rbind); - return rbind; - } -} - -/** - * Parser for IntervalBinding objects - */ -class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want IntervalBinding class objects - * @param type The type to check. - * @return true if the provided class is an IntervalBinding.class - */ - @Override - public boolean supports( Class type ) { - return isIntervalBinding(type); - } - - public static boolean isIntervalBinding( Class type ) { - return IntervalBinding.class.isAssignableFrom(type); - } - - /** - * See note from RodBindingArgumentTypeDescriptor.parse(). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @return the IntervalBinding object. - */ - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return parseBinding(source, type, matches, getArgumentTags(matches)); - } -} - -/** - * Parse simple argument types: java primitives, wrapper classes, and anything that has - * a simple String constructor. 
- */ -class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { - @Override - public boolean supports( Class type ) { - if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false; - if ( type.isPrimitive() ) return true; - if ( type.isEnum() ) return true; - if ( primitiveToWrapperMap.containsValue(type) ) return true; - - try { - type.getConstructor(String.class); - return true; - } - catch( Exception ex ) { - // An exception thrown above means that the String constructor either doesn't - // exist or can't be accessed. In either case, this descriptor doesn't support this type. - return false; - } - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - if (source.isFlag()) - return true; - - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - Object result; - Tags tags = getArgumentTags(matches); - - // lets go through the types we support - try { - if (type.isPrimitive()) { - Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); - if(value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - result = valueOf.invoke(null,value.asString().trim()); - } else if (type.isEnum()) { - Object[] vals = type.getEnumConstants(); - Object defaultEnumeration = null; // as we look at options, record the default option if it exists - for (Object val : vals) { - if (String.valueOf(val).equalsIgnoreCase(value == null ? 
null : value.asString())) return val; - try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } - catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } - } - // if their argument has no value (null), and there's a default, return that default for the enum value - if (defaultEnumeration != null && value == null) - result = defaultEnumeration; - // if their argument has no value and there's no default, throw a missing argument value exception. - // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U. - else if (value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - else - throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); - } else if (type.equals(File.class)) { - result = value == null ? null : value.asFile(); - } else { - Constructor ctor = type.getConstructor(String.class); - result = ctor.newInstance(value == null ? null : value.asString()); - } - } catch (UserException e) { - throw e; - } catch (InvocationTargetException e) { - throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", - value, source.field.getName())); - } catch (Exception e) { - throw new DynamicClassResolutionException(String.class, e); - } - - // TODO FIXME! - - // WARNING: Side effect! - parsingEngine.addTags(result,tags); - - return result; - } - - - /** - * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer - * this association available in the JRE? 
- */ - private static Map primitiveToWrapperMap = new HashMap() { - { - put( Boolean.TYPE, Boolean.class ); - put( Character.TYPE, Character.class ); - put( Byte.TYPE, Byte.class ); - put( Short.TYPE, Short.class ); - put( Integer.TYPE, Integer.class ); - put( Long.TYPE, Long.class ); - put( Float.TYPE, Float.class ); - put( Double.TYPE, Double.class ); - } - }; -} - -/** - * Process compound argument types: arrays, and typed and untyped collections. - */ -class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { - @Override - public boolean supports( Class type ) { - return ( Collection.class.isAssignableFrom(type) || type.isArray() ); - } - - @Override - @SuppressWarnings("unchecked") - public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - Type componentType; - Object result; - - if( Collection.class.isAssignableFrom(type) ) { - - // If this is a generic interface, pick a concrete implementation to create and pass back. - // Because of type erasure, don't worry about creating one of exactly the correct type. 
- if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) - { - if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; - else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; - else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; - } - - componentType = getCollectionComponentType( source.field ); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - Collection collection; - try { - collection = (Collection)type.newInstance(); - } - catch (InstantiationException e) { - logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); - throw new ReviewedStingException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); - } - catch (IllegalAccessException e) { - logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); - throw new ReviewedStingException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); - } - - for( ArgumentMatch match: matches ) { - for( ArgumentMatch value: match ) { - Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - collection.add( object ); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - - result = collection; - - } - else if( type.isArray() ) { - componentType = type.getComponentType(); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - // Assemble a collection of individual values used in this computation. 
- Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) - for( ArgumentMatch value: match ) - values.add(value); - - result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); - - int i = 0; - for( ArgumentMatch value: values ) { - Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - Array.set(result,i++,object); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - else - throw new ReviewedStingException("Unsupported compound argument type: " + type); - - return result; - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. - if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length > 1 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return parameterizedType.getActualTypeArguments()[0]; - } - else - return String.class; - } -} - -class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * The multiplexer controlling how data is split. - */ - private final Multiplexer multiplexer; - - /** - * The set of identifiers for the multiplexed entries. 
- */ - private final Collection multiplexedIds; - - public MultiplexArgumentTypeDescriptor() { - this.multiplexer = null; - this.multiplexedIds = null; - } - - /** - * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the - * given set of multiplexed ids. - * @param multiplexedIds The collection of multiplexed entries - */ - private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { - this.multiplexer = multiplexer; - this.multiplexedIds = multiplexedIds; - } - - @Override - public boolean supports( Class type ) { - return ( Map.class.isAssignableFrom(type) ); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { - // Multiplexing always creates a type default. - return true; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(multiplexer == null || multiplexedIds == null) - throw new ReviewedStingException("No multiplexed ids available"); - - Map multiplexedMapping = new HashMap(); - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); - - for(Object id: multiplexedIds) { - Object value = null; - if(componentTypeDescriptor.createsTypeDefault(source)) - value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); - multiplexedMapping.put(id,value); - } - return multiplexedMapping; - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "None"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - if(multiplexedIds == null) - throw new ReviewedStingException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); - - Map multiplexedMapping = new 
HashMap(); - - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - - - for(Object id: multiplexedIds) { - Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); - multiplexedMapping.put(id,value); - } - - parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); - - return multiplexedMapping; - } - - public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { - String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); - - List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); - Class[] sourceTypes = new Class[sourceFields.length]; - Object[] sourceValues = new Object[sourceFields.length]; - int currentField = 0; - - for(String sourceField: sourceFields) { - boolean fieldFound = false; - for(ArgumentSource source: allSources) { - if(!source.field.getName().equals(sourceField)) - continue; - if(source.field.isAnnotationPresent(Multiplex.class)) - throw new ReviewedStingException("Command-line arguments can only depend on independent fields"); - sourceTypes[currentField] = source.field.getType(); - sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); - currentField++; - fieldFound = true; - } - if(!fieldFound) - throw new ReviewedStingException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); - } - - Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); - Constructor multiplexerConstructor; - try { - multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); - multiplexerConstructor.setAccessible(true); - } - catch(NoSuchMethodException ex) { - throw new ReviewedStingException(String.format("Unable to find constructor for 
class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - Multiplexer multiplexer; - try { - multiplexer = multiplexerConstructor.newInstance(sourceValues); - } - catch(IllegalAccessException ex) { - throw new ReviewedStingException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InstantiationException ex) { - throw new ReviewedStingException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InvocationTargetException ex) { - throw new ReviewedStingException(String.format("Can't invoke constructor of class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // Multiplex arguments must resolve to maps from which the clp should extract the second type. 
- if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length != 2 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return (Class)parameterizedType.getActualTypeArguments()[1]; - } - else - return String.class; - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java deleted file mode 100644 index f00bd0ad6..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ /dev/null @@ -1,444 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.help.HelpFormatter; - -import java.io.IOException; -import java.util.*; - -public abstract class CommandLineProgram { - - /** The command-line program and the arguments it returned. */ - public ParsingEngine parser = null; - - /** the default log level */ - @Argument(fullName = "logging_level", - shortName = "l", - doc = "Set the minimum level of logging, i.e. setting INFO get's you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging.", - required = false) - protected String logging_level = "INFO"; - - - /** where to send the output of our logger */ - @Output(fullName = "log_to_file", - shortName = "log", - doc = "Set the logging location", - required = false) - protected String toFile = null; - - /** this is used to indicate if they've asked for help */ - @Argument(fullName = "help", shortName = "h", doc = "Generate this help message", required = false) - public Boolean help = false; - - /** This is used to indicate if they've asked for the version information */ - @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) - public Boolean version = false; - - - /** our logging output patterns */ - private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; - - static { - /** - * The very first thing that any Sting application does is forces the JVM locale into US English, so that we don't have - * to think about number formatting issues. 
- */ - forceJVMLocaleToUSEnglish(); - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - } - - - /** - * Allows a given application to return a brief description of itself. - * - * @return An ApplicationDetails object describing the current application. Should not be null. - */ - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), - Collections.emptyList(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. - */ - protected Collection getArgumentTypeDescriptors() { - return Collections.emptyList(); - } - - /** - * Will this application want to vary its argument list dynamically? - * If so, parse the command-line options and then prompt the subclass to return - * a list of argument providers. - * - * @return Whether the application should vary command-line arguments dynamically. - */ - protected boolean canAddArgumentsDynamically() { return false; } - - /** - * Provide a list of object to inspect, looking for additional command-line arguments. - * - * @return A list of objects to inspect. - */ - protected Class[] getArgumentSources() { - return new Class[]{}; - } - - /** - * Name this argument source. Provides the (full) class name as a default. - * - * @param source The argument source. - * - * @return a name for the argument source. - */ - protected String getArgumentSourceName( Class source ) { return source.toString(); } - - /** - * Sets the command-line parsing engine. Necessary for unit testing purposes. 
- * @param parser the new command-line parsing engine - */ - public void setParser( ParsingEngine parser ) { - this.parser = parser; - } - - /** - * this is the function that the inheriting class can expect to have called - * when all the argument processing is done - * - * @return the return code to exit the program with - * @throws Exception when an exception occurs - */ - protected abstract int execute() throws Exception; - - public static int result = -1; - - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args) throws Exception { - start(clp, args, false); - } - - /** - * This function is called to start processing the command line, and kick - * off the execute message of the program. - * - * @param clp the command line program to execute - * @param args the command line arguments passed in - * @param dryRun dry run - * @throws Exception when an exception occurs - */ - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { - - try { - // setup our log layout - PatternLayout layout = new PatternLayout(); - - Logger logger = CommandLineUtils.getStingLogger(); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Initialize the logger using the defaults. - clp.setupLoggerLevel(layout); - - // setup the parser - ParsingEngine parser = clp.parser = new ParsingEngine(clp); - parser.addArgumentSource(clp.getClass()); - - Map parsedArgs; - - // process the args - if (clp.canAddArgumentsDynamically()) { - // if the command-line program can toss in extra args, fetch them and reparse the arguments. - parser.parse(args); - - // Allow invalid and missing required arguments to pass this validation step. - // - InvalidArgument in case these arguments are specified by plugins. - // - MissingRequiredArgument in case the user requested help. 
Handle that later, once we've - // determined the full complement of arguments. - if ( ! dryRun ) - parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, - ParsingEngine.ValidationType.InvalidArgument)); - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - - Class[] argumentSources = clp.getArgumentSources(); - for (Class argumentSource : argumentSources) - parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); - parsedArgs = parser.parse(args); - - if (isVersionPresent(parser)) - printVersionAndExit(); - - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - if ( ! dryRun ) parser.validate(); - } else { - parsedArgs = parser.parse(args); - - if ( ! dryRun ) { - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - parser.validate(); - } - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - } - - if ( ! dryRun ) { - // if they specify a log location, output our data there - if (clp.toFile != null) { - FileAppender appender; - try { - appender = new FileAppender(layout, clp.toFile, false); - logger.addAppender(appender); - } catch (IOException e) { - throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); - } - } - - // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); - - // call the execute - CommandLineProgram.result = clp.execute(); - } - } - catch (ArgumentException e) { - //clp.parser.printHelp(clp.getApplicationDetails()); - // Rethrow the exception to exit with an error. - throw e; - } - } - - /** - * Find fields in the object obj that look like command-line arguments, and put command-line - * arguments into them. 
- * - * @param obj Object to inspect for command line arguments. - */ - public void loadArgumentsIntoObject(Object obj) { - parser.loadArgumentsIntoObject(obj); - } - - /** - * this function checks the logger level passed in on the command line, taking the lowest - * level that was provided. - * @param layout Pattern layout to format based on the logger level. - */ - private void setupLoggerLevel(PatternLayout layout) { - layout.setConversionPattern(patternString); - - // set the default logger level - Level par; - if (logging_level.toUpperCase().equals("DEBUG")) { - par = Level.DEBUG; - } else if (logging_level.toUpperCase().equals("ERROR")) { - par = Level.ERROR; - } else if (logging_level.toUpperCase().equals("FATAL")) { - par = Level.FATAL; - } else if (logging_level.toUpperCase().equals("INFO")) { - par = Level.INFO; - } else if (logging_level.toUpperCase().equals("WARN")) { - par = Level.WARN; - } else if (logging_level.toUpperCase().equals("OFF")) { - par = Level.OFF; - } else { - // we don't understand the logging level, let's get out of here - throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (INFO, DEBUG, ERROR, FATAL, OFF)"); - } - - Logger.getRootLogger().setLevel(par); - } - - /** - * a function used to indicate an error occurred in the command line tool - */ - private static void printDocumentationReference() { - errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); - } - - - /** - * Do a cursory search for the given argument. - * - * @param parser Parser - * - * @return True if help is present; false otherwise. - */ - private static boolean isHelpPresent(ParsingEngine parser) { - return parser.isArgumentPresent("help"); - } - - /** - * Print help and exit. - * - * @param clp Instance of the command-line program. 
- * @param parser True if help is present; false otherwise. - */ - private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { - parser.printHelp(clp.getApplicationDetails()); - System.exit(0); - } - - /** - * Do a cursory search for the argument "version". - * - * @param parser Parser - * - * @return True if version is present; false otherwise. - */ - private static boolean isVersionPresent(ParsingEngine parser) { - return parser.isArgumentPresent("version"); - } - - /** - * Print help and exit. - */ - private static void printVersionAndExit() { - System.out.println(CommandLineGATK.getVersionNumber().toString()); - System.exit(0); - } - - - private static void errorPrintf(String format, Object... s) { - String formatted = String.format(format, s); - - if ( formatted.trim().equals("") ) - System.err.println("##### ERROR"); - else { - for ( String part : formatted.split("\n") ) { - System.err.println("##### ERROR " + part); - } - } - } - - - /** - * used to indicate an error occured - * - * @param msg the message - * @param t the error - */ - public static void exitSystemWithError(String msg, final Throwable t) { - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("stack trace %n"); - t.printStackTrace(); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); - errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); - printDocumentationReference(); - if ( msg == null ) // some exceptions don't have detailed messages - msg = "Code exception (see stack trace for error itself)"; - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", msg.trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithUserError(final Exception e) { - if ( e.getMessage() == null ) - throw new ReviewedStingException("UserException found with no message!", e); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); - errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithSamError(final Throwable t) { - if ( t.getMessage() == null ) - throw new ReviewedStingException("SamException found with no message!", t); - - 
errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); - errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); - errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); - errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - - /** - * used to indicate an error occured - * - * @param t the exception that occurred - */ - public static void exitSystemWithError(Throwable t) { - exitSystemWithError(t.getMessage(), t); - } - - /** - * A hack to ensure that numbers are always formatted in the US style. 
- */ - protected static void forceJVMLocaleToUSEnglish() { - Locale.setDefault(Locale.US); - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java deleted file mode 100644 index b491c9f3d..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; - -import java.util.List; - -public class IntervalArgumentCollection { - /** - * Using this option one can instruct the GATK engine to traverse over only part of the genome. 
This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). - * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). - * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). 
- */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? - */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. - */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java deleted file mode 100644 index 9253e1ee5..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Requires; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.FeatureReader; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; - -import java.util.*; - -/** - * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. - * - * The IntervalBinding is a formal GATK argument that bridges between a walker and - * the engine to construct intervals for traversal at runtime. The IntervalBinding can - * either be a RodBinding, a string of one interval, or a file with interval strings. - * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. - * - * Note that this class is immutable. 
- */ -public final class IntervalBinding { - - private RodBinding featureIntervals; - private String stringIntervals; - - @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) - public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { - featureIntervals = new RodBinding(type, rawName, source, tribbleType, tags); - } - - @Requires({"intervalArgument != null"}) - public IntervalBinding(String intervalArgument) { - stringIntervals = intervalArgument; - } - - public String getSource() { - if ( featureIntervals != null ) - return featureIntervals.getSource(); - return stringIntervals; - } - - public List getIntervals(final GenomeAnalysisEngine toolkit) { - return getIntervals(toolkit.getGenomeLocParser()); - } - - public List getIntervals(final GenomeLocParser genomeLocParser) { - List intervals; - - if ( featureIntervals != null ) { - intervals = new ArrayList(); - - // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files - - final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); - if ( codec instanceof ReferenceDependentFeatureCodec ) - ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); - try { - FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); - for ( Feature feature : reader.iterator() ) - intervals.add(genomeLocParser.createGenomeLoc(feature)); - } catch (Exception e) { - throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); - } - - } else { - intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); - } - - Collections.sort(intervals); - return intervals; - } - - public String toString() { - return getSource(); - } -} diff --git 
a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java deleted file mode 100644 index aca20d5a1..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ /dev/null @@ -1,828 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Requires; -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.HelpFormatter; - -import java.io.File; -import java.io.IOException; -import java.lang.annotation.Annotation; -import java.lang.reflect.Field; -import java.util.*; - -/** - * A parser for Sting command-line arguments. - */ -public class ParsingEngine { - - /** - * The loaded argument sources along with their back definitions. - */ - private Map argumentSourcesByDefinition = new HashMap(); - - /** - * A list of defined arguments against which command lines are matched. - * Package protected for testing access. - */ - public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); - - /** - * A list of matches from defined arguments to command-line text. - * Indicates as best as possible where command-line text remains unmatched - * to existing arguments. - */ - private ArgumentMatches argumentMatches = null; - - /** - * Techniques for parsing and for argument lookup. - */ - private List parsingMethods = new ArrayList(); - - /** - * All of the RodBinding objects we've seen while parsing - */ - private List rodBindings = new ArrayList(); - - /** - * Class reference to the different types of descriptors that the create method can create. - * The type of set used must be ordered (but not necessarily sorted). 
- */ - private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), - new IntervalBindingArgumentTypeDescriptor(), - new RodBindingArgumentTypeDescriptor(), - new CompoundArgumentTypeDescriptor(), - new MultiplexArgumentTypeDescriptor()) ); - - private Set argumentTypeDescriptors = new LinkedHashSet(); - - /** - * List of tags associated with the given instantiation of the command-line argument. - */ - private final Map tags = new IdentityHashMap(); - - private PluginManager argumentProviderPluginManager = - new PluginManager(ParsingEngineArgumentProvider.class); - - /** - * our log, which we want to capture anything from org.broadinstitute.sting - */ - protected static Logger logger = Logger.getLogger(ParsingEngine.class); - - public ParsingEngine( CommandLineProgram clp ) { - RodBinding.resetNameCounter(); - parsingMethods.add( ParsingMethod.FullNameParsingMethod ); - parsingMethods.add( ParsingMethod.ShortNameParsingMethod ); - - // Order matters here! Make sure the clp's new type descriptors go in before the original type descriptors. - if(clp != null) - argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); - argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); - - List> providers = argumentProviderPluginManager.getPlugins(); - for (Class provider: providers) { - addArgumentSource(provider); - } - } - - /** - * Add a main argument source. Argument sources are expected to have - * any number of fields with an @Argument annotation attached. - * @param source An argument source from which to extract command-line arguments. - */ - public void addArgumentSource( Class source ) { - addArgumentSource(null, source); - } - - public ArgumentMatches getArgumentMatches() { - return argumentMatches; - } - - /** - * Add an argument source. Argument sources are expected to have - * any number of fields with an @Argument annotation attached. 
- * @param sourceName name for this argument source. 'Null' indicates that this source should be treated - * as the main module. - * @param sourceClass A class containing argument sources from which to extract command-line arguments. - */ - public void addArgumentSource( String sourceName, Class sourceClass ) { - List argumentsFromSource = new ArrayList(); - for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { - List argumentDefinitions = argumentSource.createArgumentDefinitions(); - for(ArgumentDefinition argumentDefinition: argumentDefinitions) { - argumentSourcesByDefinition.put(argumentDefinition,argumentSource); - argumentsFromSource.add( argumentDefinition ); - } - } - argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); - } - - /** - * Do a cursory search to see if an argument with the given name is present. - * @param argumentFullName full name of the argument. - * @return True if the argument is present. False otherwise. - */ - public boolean isArgumentPresent( String argumentFullName ) { - ArgumentDefinition definition = - argumentDefinitions.findArgumentDefinition(argumentFullName,ArgumentDefinitions.FullNameDefinitionMatcher); - return argumentMatches.hasMatch(definition); - - } - - /** - * Parse the given set of command-line arguments, returning - * an ArgumentMatches object describing the best fit of these - * command-line arguments to the arguments that are actually - * required. - * @param tokens Tokens passed on the command line. - * @return The parsed arguments by file. 
- */ - public SortedMap parse( String[] tokens ) { - argumentMatches = new ArgumentMatches(); - SortedMap parsedArgs = new TreeMap(); - - List cmdLineTokens = Arrays.asList(tokens); - parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); - - List providers = argumentProviderPluginManager.createAllTypes(); - - for (ParsingEngineArgumentProvider provider: providers) { - // Load the arguments ONLY into the provider. - // Validation may optionally run on the rest of the arguments. - loadArgumentsIntoObject(provider); - } - - for (ParsingEngineArgumentProvider provider: providers) { - provider.parse(this, parsedArgs); - } - - return parsedArgs; - } - - public void parse(ArgumentMatchSource matchSource, List tokens, - ArgumentMatches argumentMatches, SortedMap parsedArgs) { - ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); - - int i = 0; - for (String token: tokens) { - // If the token is of argument form, parse it into its own argument match. - // Otherwise, pair it with the most recently used argument discovered. 
- ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); - if( isArgumentForm(token) ) { - ArgumentMatch argumentMatch = parseArgument( token, site ); - if( argumentMatch != null ) { - argumentMatches.mergeInto( argumentMatch ); - lastArgumentMatchSite = site; - } - } - else { - if( argumentMatches.hasMatch(lastArgumentMatchSite) && - !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) - argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) ); - else - argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) ); - - } - i++; - } - - parsedArgs.put(matchSource, new ParsedListArgs(tokens)); - } - - public void parsePairs(ArgumentMatchSource matchSource, List> tokens, - ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs, - SortedMap parsedArgs) { - int i = 0; - for (Pair pair: tokens) { - - ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); - List matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher); - ArgumentDefinition definition = null; - for (DefinitionMatcher matcher: matchers) { - definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher ); - if (definition != null) - break; - } - if (definition == null) - continue; - ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags()); - argumentMatches.mergeInto(argumentMatch); - argumentMatch.addValue(site, pair.getSecond()); - i++; - } - - parsedArgs.put(matchSource, matchSourceArgs); - } - - protected List getArguments(File file) { - try { - if (file.getAbsolutePath().endsWith(".list")) { - return getListArguments(file); - } - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - throw new UserException.CouldNotReadInputFile(file, "file extension is not .list"); - } - - private List getListArguments(File 
file) throws IOException { - ArrayList argsList = new ArrayList(); - for (String line: FileUtils.readLines(file)) - argsList.addAll(Arrays.asList(Utils.escapeExpressions(line))); - return argsList; - } - - public enum ValidationType { MissingRequiredArgument, - InvalidArgument, - InvalidArgumentValue, - ValueMissingArgument, - TooManyValuesForArgument, - MutuallyExclusive } - - /** - * Validates the list of command-line argument matches. - */ - public void validate() { - validate( EnumSet.noneOf(ValidationType.class) ); - } - - /** - * Validates the list of command-line argument matches. On failure throws an exception with detailed info about the - * particular failures. Takes an EnumSet indicating which validation checks to skip. - * @param skipValidationOf List of validation checks to skip. - */ - public void validate( EnumSet skipValidationOf ) { - // Find missing required arguments. - if( !skipValidationOf.contains(ValidationType.MissingRequiredArgument) ) { - Collection requiredArguments = - argumentDefinitions.findArgumentDefinitions( true, ArgumentDefinitions.RequiredDefinitionMatcher ); - Collection missingArguments = new ArrayList(); - for( ArgumentDefinition requiredArgument: requiredArguments ) { - if( !argumentMatches.hasMatch(requiredArgument) ) - missingArguments.add( requiredArgument ); - } - - if( missingArguments.size() > 0 ) - throw new MissingArgumentException( missingArguments ); - } - - // Find invalid arguments. Invalid arguments will have a null argument definition. - if( !skipValidationOf.contains(ValidationType.InvalidArgument) ) { - ArgumentMatches invalidArguments = argumentMatches.findUnmatched(); - if( invalidArguments.size() > 0 ) - throw new InvalidArgumentException( invalidArguments ); - } - - // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. 
- if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { - Collection verifiableArguments = - argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); - Collection> invalidValues = new ArrayList>(); - for( ArgumentDefinition verifiableArgument: verifiableArguments ) { - ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); - // Check to see whether an argument value was specified. Argument values must be provided - // when the argument name is specified and the argument is not a flag type. - for(ArgumentMatch verifiableMatch: verifiableMatches) { - ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); - if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) - invalidValues.add(new Pair(verifiableArgument,null)); - } - - // Ensure that the field contents meet the validation criteria specified by the regular expression. - for( ArgumentMatch verifiableMatch: verifiableMatches ) { - for( ArgumentMatchValue value: verifiableMatch.values() ) { - if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) ) - invalidValues.add( new Pair(verifiableArgument, value.asString()) ); - } - } - } - - if( invalidValues.size() > 0 ) - throw new InvalidArgumentValueException( invalidValues ); - } - - // Find values without an associated mate. - if( !skipValidationOf.contains(ValidationType.ValueMissingArgument) ) { - if( argumentMatches.MissingArgument.values().size() > 0 ) - throw new UnmatchedArgumentException( argumentMatches.MissingArgument ); - } - - // Find arguments with too many values. 
- if( !skipValidationOf.contains(ValidationType.TooManyValuesForArgument)) { - Collection overvaluedArguments = new ArrayList(); - for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { - // Warning: assumes that definition is not null (asserted by checks above). - if( !argumentMatch.definition.isMultiValued && argumentMatch.values().size() > 1 ) - overvaluedArguments.add(argumentMatch); - } - - if( !overvaluedArguments.isEmpty() ) - throw new TooManyValuesForArgumentException(overvaluedArguments); - } - - // Find sets of options that are supposed to be mutually exclusive. - if( !skipValidationOf.contains(ValidationType.MutuallyExclusive)) { - Collection> invalidPairs = new ArrayList>(); - for( ArgumentMatch argumentMatch: argumentMatches.findSuccessfulMatches() ) { - if( argumentMatch.definition.exclusiveOf != null ) { - for( ArgumentMatch conflictingMatch: argumentMatches.findSuccessfulMatches() ) { - // Skip over the current element. - if( argumentMatch == conflictingMatch ) - continue; - if( argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.fullName) || - argumentMatch.definition.exclusiveOf.equals(conflictingMatch.definition.shortName)) - invalidPairs.add( new Pair(argumentMatch, conflictingMatch) ); - } - } - } - - if( !invalidPairs.isEmpty() ) - throw new ArgumentsAreMutuallyExclusiveException( invalidPairs ); - } - } - - /** - * Loads a set of matched command-line arguments into the given object. - * @param object Object into which to add arguments. - */ - public void loadArgumentsIntoObject( Object object ) { - loadArgumentsIntoObject(object, true); - } - - /** - * Loads a set of matched command-line arguments into the given object. - * @param object Object into which to add arguments. - * @param enforceArgumentRanges If true, check that the argument value is within the range specified - * in the corresponding Argument annotation by min/max value attributes. 
This - * check is only performed for numeric types, and only when a min and/or - * max value is actually defined in the annotation. It is also only performed - * for values actually specified on the command line, and not for default values. - */ - public void loadArgumentsIntoObject( Object object, boolean enforceArgumentRanges ) { - List argumentSources = extractArgumentSources(object.getClass()); - - List dependentArguments = new ArrayList(); - - for( ArgumentSource argumentSource: argumentSources ) { - if(argumentSource.isDeprecated() && argumentMatches.findMatches(this,argumentSource).size() > 0) - notifyDeprecatedCommandLineArgument(argumentSource); - - // If this argument source depends on other command-line arguments, skip it and make a note to process it later. - if(argumentSource.isDependent()) { - dependentArguments.add(argumentSource); - continue; - } - loadValueIntoObject(argumentSource, object, argumentMatches.findMatches(this,argumentSource), enforceArgumentRanges); - } - - for(ArgumentSource dependentArgument: dependentArguments) { - MultiplexArgumentTypeDescriptor dependentDescriptor = dependentArgument.createDependentTypeDescriptor(this,object); - ArgumentSource dependentSource = dependentArgument.copyWithCustomTypeDescriptor(dependentDescriptor); - loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource), enforceArgumentRanges); - } - } - - /** - * Notify the user that tags have been created. - * @param key The key created. - * @param tags List of tags, or empty list if no tags are present. - */ - public void addTags(Object key, final Tags tags) { - this.tags.put(key,tags); - } - - /** - * Gets the tags associated with a given object. - * @param key Key for which to find a tag. - * @return List of tags associated with this key. - */ - public Tags getTags(Object key) { - if(!tags.containsKey(key)) - return new Tags(); - return tags.get(key); - } - - /** - * Add a RodBinding type argument to this parser. 
Called during parsing to allow - * us to track all of the RodBindings discovered in the command line. - * @param rodBinding the rodbinding to add. Must not be added twice - */ - @Requires("rodBinding != null") - public void addRodBinding(final RodBinding rodBinding) { - rodBindings.add(rodBinding); - } - - /** - * Notify the user that a deprecated command-line argument has been used. - * @param argumentSource Deprecated argument source specified by user. - */ - private void notifyDeprecatedCommandLineArgument(ArgumentSource argumentSource) { - // Grab the first argument definition and report that one as the failure. Theoretically, we should notify of all failures. - List definitions = argumentSource.createArgumentDefinitions(); - if(definitions.size() < 1) - throw new ReviewedStingException("Internal error. Argument source creates no definitions."); - ArgumentDefinition definition = definitions.get(0); - throw new UserException.DeprecatedArgument(definition.fullName,definition.doc); - } - - /** - * Loads a single argument into the object and that objects children. - * @param argumentMatches Argument matches to load into the object. - * @param source Argument source to load into the object. - * @param instance Object into which to inject the value. The target might be in a container within the instance. - * @param enforceArgumentRanges If true, check that the argument value is within the range specified - * in the corresponding Argument annotation by min/max value attributes. This - * check is only performed for numeric types, and only when a min and/or - * max value is actually defined in the annotation. It is also only performed - * for values actually specified on the command line, and not for default values. - */ - private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches, boolean enforceArgumentRanges ) { - // Nothing to load - if( argumentMatches.size() == 0 && ! 
source.createsTypeDefault() ) - return; - - // Target instance into which to inject the value. - Collection targets = findTargets( source, instance ); - - // Abort if no home is found for the object. - if( targets.size() == 0 ) - throw new ReviewedStingException("Internal command-line parser error: unable to find a home for argument matches " + argumentMatches); - - for( Object target: targets ) { - Object value; - boolean usedTypeDefault = false; - if ( argumentMatches.size() != 0 ) { - value = source.parse(this,argumentMatches); - } - else { - value = source.createTypeDefault(this); - usedTypeDefault = true; - } - - // Only check argument ranges if a check was requested AND we used a value from the command line rather - // than the type default - if ( enforceArgumentRanges && ! usedTypeDefault ) { - checkArgumentRange(source, value); - } - - JVMUtils.setFieldValue(source.field,target,value); - } - } - - /** - * Check the provided value against any range constraints specified in the Argument annotation - * for the corresponding field. Throw an exception if hard limits are violated, or emit a warning - * if soft limits are violated. - * - * Only checks numeric types (int, double, etc.) - * Only checks fields with an actual @Argument annotation - * Only checks manually-specified constraints (there are no default constraints). - * - * @param argumentSource The source field for the command-line argument - * @param argumentValue The value we're considering putting in that source field - */ - private void checkArgumentRange( final ArgumentSource argumentSource, final Object argumentValue ) { - // Only validate numeric types - if ( ! 
(argumentValue instanceof Number) ) { - return; - } - final double argumentDoubleValue = ((Number)argumentValue).doubleValue(); - - // Only validate fields with an @Argument annotation - final Annotation argumentAnnotation = argumentSource.field.getAnnotation(Argument.class); - if ( argumentAnnotation == null ) { - return; - } - - final double minValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minValue"); - final double maxValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxValue"); - final double minRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minRecommendedValue"); - final double maxRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxRecommendedValue"); - final String argumentName = (String)CommandLineUtils.getValue(argumentAnnotation, "fullName"); - - // Check hard limits first, if specified - if ( minValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minValue ) { - throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, minValue, "minimum"); - } - - if ( maxValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxValue ) { - throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, maxValue, "maximum"); - } - - // Then check soft limits, if specified - if ( minRecommendedValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minRecommendedValue ) { - logger.warn(String.format("WARNING: argument --%s has value %.2f, but minimum recommended value is %.2f", - argumentName, argumentDoubleValue, minRecommendedValue)); - } - - if ( maxRecommendedValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxRecommendedValue ) { - logger.warn(String.format("WARNING: argument --%s has value %.2f, but maximum recommended value is %.2f", - argumentName, argumentDoubleValue, maxRecommendedValue)); - } - } - - public Collection getRodBindings() { - return Collections.unmodifiableCollection(rodBindings); - } - - /** - * Gets a 
collection of the container instances of the given type stored within the given target. - * @param source Argument source. - * @param instance Container. - * @return A collection of containers matching the given argument source. - */ - private Collection findTargets(ArgumentSource source, Object instance) { - LinkedHashSet targets = new LinkedHashSet(); - for( Class clazz = instance.getClass(); clazz != null; clazz = clazz.getSuperclass() ) { - for( Field field: clazz.getDeclaredFields() ) { - if( field.equals(source.field) ) { - targets.add(instance); - } else if( field.isAnnotationPresent(ArgumentCollection.class) ) { - targets.addAll(findTargets(source, JVMUtils.getFieldValue(field, instance))); - } - } - } - return targets; - } - - /** - * Prints out the help associated with these command-line argument definitions. - * @param applicationDetails Details about the specific GATK-based application being run. - */ - public void printHelp( ApplicationDetails applicationDetails ) { - new HelpFormatter().printHelp(applicationDetails,argumentDefinitions); - } - - /** - * Extract all the argument sources from a given object. - * @param sourceClass class to act as sources for other arguments. - * @return A list of sources associated with this object and its aggregated objects. - */ - public List extractArgumentSources(Class sourceClass) { - return extractArgumentSources(sourceClass, new Field[0]); - } - - /** - * Fetch the best command-line argument descriptor for the given class. - * @param type Class for which to specify a descriptor. - * @return descriptor for the given type. 
- */ - public ArgumentTypeDescriptor selectBestTypeDescriptor(Class type) { - return ArgumentTypeDescriptor.selectBest(argumentTypeDescriptors,type); - } - - private List extractArgumentSources(Class sourceClass, Field[] parentFields) { - // now simply call into the truly general routine extract argument bindings but with a null - // object so bindings aren't computed - Map bindings = extractArgumentBindings(null, sourceClass, parentFields); - return new ArrayList(bindings.keySet()); - } - - public Map extractArgumentBindings(Object obj) { - if ( obj == null ) throw new IllegalArgumentException("Incoming object cannot be null"); - return extractArgumentBindings(obj, obj.getClass(), new Field[0]); - } - - /** - * Extract all the argument sources from a given object, along with their bindings if obj != null . - * @param obj the object corresponding to the sourceClass - * @param sourceClass class to act as sources for other arguments. - * @param parentFields Parent Fields - * @return A map of sources associated with this object and its aggregated objects and bindings to their bindings values - */ - private Map extractArgumentBindings(Object obj, Class sourceClass, Field[] parentFields) { - Map bindings = new LinkedHashMap(); - - while( sourceClass != null ) { - Field[] fields = sourceClass.getDeclaredFields(); - for( Field field: fields ) { - if( ArgumentTypeDescriptor.isArgumentAnnotationPresent(field) ) { - Object val = obj != null ? JVMUtils.getFieldValue(field, obj) : null; - bindings.put( new ArgumentSource(parentFields, field, selectBestTypeDescriptor(field.getType())), val ); - } - if( field.isAnnotationPresent(ArgumentCollection.class) ) { - Object val = obj != null ? 
JVMUtils.getFieldValue(field, obj) : null; - Field[] newParentFields = Arrays.copyOf(parentFields, parentFields.length + 1); - newParentFields[parentFields.length] = field; - bindings.putAll( extractArgumentBindings(val, field.getType(), newParentFields) ); - } - } - - sourceClass = sourceClass.getSuperclass(); - } - - return bindings; - } - - /** - * Determines whether a token looks like the name of an argument. - * @param token Token to inspect. Can be surrounded by whitespace. - * @return True if token is of short name form. - */ - private boolean isArgumentForm( String token ) { - for( ParsingMethod parsingMethod: parsingMethods ) { - if( parsingMethod.matches(token) ) - return true; - } - - return false; - } - - /** - * Parse a short name into an ArgumentMatch. - * @param token The token to parse. The token should pass the isLongArgumentForm test. - * @param position The position of the token in question. - * @return ArgumentMatch associated with this token, or null if no match exists. - */ - private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) { - if( !isArgumentForm(token) ) - throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token ); - - for( ParsingMethod parsingMethod: parsingMethods ) { - if( parsingMethod.matches( token ) ) - return parsingMethod.match( argumentDefinitions, token, position ); - } - - // No parse results found. - return null; - } -} - -/** - * An exception indicating that some required arguments are missing. 
- */ -class MissingArgumentException extends ArgumentException { - public MissingArgumentException( Collection missingArguments ) { - super( formatArguments(missingArguments) ); - } - - private static String formatArguments( Collection missingArguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentDefinition missingArgument: missingArguments ) { - if( missingArgument.shortName != null ) - sb.append( String.format("%nArgument with name '--%s' (-%s) is missing.", missingArgument.fullName, missingArgument.shortName) ); - else - sb.append( String.format("%nArgument with name '--%s' is missing.", missingArgument.fullName) ); - } - return sb.toString(); - } -} - -/** - * An exception for undefined arguments. - */ -class InvalidArgumentException extends ArgumentException { - public InvalidArgumentException( ArgumentMatches invalidArguments ) { - super( formatArguments(invalidArguments) ); - } - - private static String formatArguments( ArgumentMatches invalidArguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatch invalidArgument: invalidArguments ) - sb.append( String.format("%nArgument with name '%s' isn't defined.", invalidArgument.label) ); - return sb.toString(); - } -} - -/** - * An exception for values whose format is invalid. 
- */ -class InvalidArgumentValueException extends ArgumentException { - public InvalidArgumentValueException( Collection> invalidArgumentValues ) { - super( formatArguments(invalidArgumentValues) ); - } - - private static String formatArguments( Collection> invalidArgumentValues ) { - StringBuilder sb = new StringBuilder(); - for( Pair invalidValue: invalidArgumentValues ) { - if(invalidValue.getSecond() == null) - sb.append( String.format("%nArgument '--%s' requires a value but none was provided", - invalidValue.first.fullName) ); - else - sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", - invalidValue.first.fullName, - invalidValue.second, - invalidValue.first.validation) ); - } - return sb.toString(); - } -} - -class ArgumentValueOutOfRangeException extends ArgumentException { - public ArgumentValueOutOfRangeException( final String argumentName, final double argumentActualValue, - final double argumentBoundaryValue, final String argumentBoundaryType ) { - super(String.format("Argument --%s has value %.2f, but %s allowed value is %.2f", - argumentName, argumentActualValue, argumentBoundaryType, argumentBoundaryValue)); - } -} - -/** - * An exception for values that can't be mated with any argument. 
- */ -class UnmatchedArgumentException extends ArgumentException { - public UnmatchedArgumentException( ArgumentMatch invalidValues ) { - super( formatArguments(invalidValues) ); - } - - private static String formatArguments( ArgumentMatch invalidValues ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatchSite site: invalidValues.sites.keySet() ) - for( ArgumentMatchValue value: invalidValues.sites.get(site) ) { - switch (site.getSource().getType()) { - case CommandLine: - sb.append( String.format("%nInvalid argument value '%s' at position %d.", - value.asString(), site.getIndex()) ); - break; - case Provider: - sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.", - value.asString(), site.getSource().getDescription(), site.getIndex()) ); - break; - default: - throw new RuntimeException( String.format("Unexpected argument match source type: %s", - site.getSource().getType())); - } - if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString())) - sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace."); - } - return sb.toString(); - } -} - -/** - * An exception indicating that too many values have been provided for the given argument. - */ -class TooManyValuesForArgumentException extends ArgumentException { - public TooManyValuesForArgumentException( Collection arguments ) { - super( formatArguments(arguments) ); - } - - private static String formatArguments( Collection arguments ) { - StringBuilder sb = new StringBuilder(); - for( ArgumentMatch argument: arguments ) - sb.append( String.format("%nArgument '%s' has too many values: %s.", argument.label, Arrays.deepToString(argument.values().toArray())) ); - return sb.toString(); - } -} - -/** - * An exception indicating that mutually exclusive options have been passed in the same command line. 
- */ -class ArgumentsAreMutuallyExclusiveException extends ArgumentException { - public ArgumentsAreMutuallyExclusiveException( Collection> arguments ) { - super( formatArguments(arguments) ); - } - - private static String formatArguments( Collection> arguments ) { - StringBuilder sb = new StringBuilder(); - for( Pair argument: arguments ) - sb.append( String.format("%nArguments '%s' and '%s' are mutually exclusive.", argument.first.definition.fullName, argument.second.definition.fullName ) ); - return sb.toString(); - } - -} - - -/** - * An exception for when an argument doesn't match an of the enumerated options for that var type - */ -class UnknownEnumeratedValueException extends ArgumentException { - public UnknownEnumeratedValueException(ArgumentDefinition definition, String argumentPassed) { - super( formatArguments(definition,argumentPassed) ); - } - - private static String formatArguments(ArgumentDefinition definition, String argumentPassed) { - return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java deleted file mode 100644 index ef8e01df4..000000000 --- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java +++ /dev/null @@ -1,197 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice 
and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.commandline; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.Feature; - -import java.util.*; - -/** - * A RodBinding representing a walker argument that gets bound to a ROD track. - * - * The RodBinding is a formal GATK argument that bridges between a walker and - * the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding - * is explicitly typed with type of the Tribble.Feature expected to be produced by this - * argument. The GATK Engine takes care of initializing the binding and connecting it - * to the RMD system. - * - * It is recommended that optional RodBindings be initialized to the value returned - * by the static method makeUnbound(). - * - * Note that this class is immutable. - */ -public final class RodBinding { - protected final static String UNBOUND_VARIABLE_NAME = ""; - protected final static String UNBOUND_SOURCE = "UNBOUND"; - protected final static String UNBOUND_TRIBBLE_TYPE = ""; - - /** - * Create an unbound Rodbinding of type. This is the correct programming - * style for an optional RodBinding - * - * At Input() - * RodBinding x = RodBinding.makeUnbound(T.class) - * - * The unbound binding is guaranteed to never match any binding. It uniquely - * returns false to isBound(). 
- * - * @param type the Class type produced by this unbound object - * @param any class extending Tribble Feature - * @return the UNBOUND RodBinding producing objects of type T - */ - @Requires("type != null") - protected final static RodBinding makeUnbound(Class type) { - return new RodBinding(type); - } - - /** The name of this binding. Often the name of the field itself, but can be overridden on cmdline */ - final private String name; - /** where the data for this ROD is coming from. A file or special value if coming from stdin */ - final private String source; - /** the string name of the tribble type, such as vcf, bed, etc. */ - final private String tribbleType; - /** The command line tags associated with this RodBinding */ - final private Tags tags; - /** The Java class expected for this RodBinding. Must correspond to the type emited by Tribble */ - final private Class type; - /** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */ - final private boolean bound; - - /** - * The name counter. This is how we create unique names for collections of RodBindings - * on the command line. If you have provide the GATK with -X file1 and -X file2 to a - * RodBinding argument as List> then each binding will receive automatically - * the name of X and X2. 
- */ - final private static Map nameCounter = new HashMap(); - - /** for UnitTests */ - final public static void resetNameCounter() { - nameCounter.clear(); - } - - @Requires("rawName != null") - @Ensures("result != null") - final private static synchronized String countedVariableName(final String rawName) { - Integer count = nameCounter.get(rawName); - if ( count == null ) { - nameCounter.put(rawName, 1); - return rawName; - } else { - nameCounter.put(rawName, count + 1); - return rawName + (count + 1); - } - } - - @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) - public RodBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { - this.type = type; - this.name = countedVariableName(rawName); - this.source = source; - this.tribbleType = tribbleType; - this.tags = tags; - this.bound = true; - } - - /** - * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName - * @param type - * @param rawName - */ - public RodBinding(Class type, final String rawName) { - this(type, rawName, "missing", type.getSimpleName(), new Tags()); - } - - /** - * Make an unbound RodBinding. Only available for creating the globally unique UNBOUND object - * @param type class this unbound RodBinding creates - */ - @Requires({"type != null"}) - private RodBinding(Class type) { - this.type = type; - this.name = UNBOUND_VARIABLE_NAME; // special value can never be found in RefMetaDataTracker - this.source = UNBOUND_SOURCE; - this.tribbleType = UNBOUND_TRIBBLE_TYPE; - this.tags = new Tags(); - this.bound = false; - } - - - /** - * @return True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments - */ - final public boolean isBound() { - return bound; - } - - /** - * @return The name of this binding. 
Often the name of the field itself, but can be overridden on cmdline - */ - @Ensures({"result != null"}) - final public String getName() { - return name; - } - - /** - * @return the string name of the tribble type, such as vcf, bed, etc. - */ - @Ensures({"result != null"}) - final public Class getType() { - return type; - } - - /** - * @return where the data for this ROD is coming from. A file or special value if coming from stdin - */ - @Ensures({"result != null"}) - final public String getSource() { - return source; - } - - /** - * @return The command line tags associated with this RodBinding. Will include the tags used to - * determine the name and type of this RodBinding - */ - @Ensures({"result != null"}) - final public Tags getTags() { - return tags; - } - - /** - * @return The Java class expected for this RodBinding. Must correspond to the type emited by Tribble - */ - @Ensures({"result != null"}) - final public String getTribbleType() { - return tribbleType; - } - - @Override - public String toString() { - return String.format("(RodBinding name=%s source=%s)", getName(), getSource()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java deleted file mode 100644 index 111786e63..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ /dev/null @@ -1,221 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission 
notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.crypt.CryptUtils; -import org.broadinstitute.sting.utils.crypt.GATKKey; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.ListFileUtils; - -import java.security.PublicKey; -import java.util.*; - -/** - * @author aaron - */ -public abstract class CommandLineExecutable extends CommandLineProgram { - /** - * The actual engine which performs the analysis. - */ - protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - - // get the analysis name - public abstract String getAnalysisName(); - - /** - * Gets the GATK argument bundle. 
- * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. - */ - protected abstract GATKArgumentCollection getArgumentCollection(); - - /** - * A list of all the arguments initially used as sources. - */ - private final Collection argumentSources = new ArrayList(); - - protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); - - /** - * this is the function that the inheriting class can expect to have called - * when the command line system has initialized. - * - * @return the return code to exit the program with - */ - protected int execute() throws Exception { - engine.setParser(parser); - argumentSources.add(this); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - - try { - // Make sure a valid GATK user key is present, if required. - authorizeGATKRun(); - - engine.setArguments(getArgumentCollection()); - - // File lists can require a bit of additional expansion. Set these explicitly by the engine. - engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser)); - - engine.setWalker(walker); - walker.setToolkit(engine); - - Collection filters = engine.createFilters(); - engine.setFilters(filters); - - // load the arguments into the walker / filters. - // TODO: The fact that this extra load call exists here when all the parsing happens at the engine - // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive - // TODO: argument processing. 
- loadArgumentsIntoObject(walker); - argumentSources.add(walker); - - Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); - engine.setReferenceMetaDataFiles(rodBindings); - - for (ReadFilter filter: filters) { - loadArgumentsIntoObject(filter); - argumentSources.add(filter); - } - - engine.execute(); - generateGATKRunReport(walker); - } catch ( Exception e ) { - generateGATKRunReport(walker, e); - throw e; - } - - // always return 0 - return 0; - } - - /** - * Authorizes this run of the GATK by checking for a valid GATK user key, if required. - * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. - */ - private void authorizeGATKRun() { - if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || - getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { - if ( getArgumentCollection().gatkKeyFile == null ) { - throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + - "Please see " + UserException.PHONE_HOME_DOCS_URL + - " for more information and instructions on how to obtain a key."); - } - else { - PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); - - if ( ! gatkUserKey.isValid() ) { - throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); - } - } - } - } - - /** - * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. - * This report will be written to either STDOUT or to the run repository, depending on the options - * for -et. 
- * - * @param e the exception, can be null if no exception occurred - */ - private void generateGATKRunReport(Walker walker, Exception e) { - if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { - GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); - report.postReport(getArgumentCollection().phoneHomeType); - } - } - - /** - * Convenience method for fully parameterized generateGATKRunReport when an exception has - * not occurred - * - * @param walker - */ - private void generateGATKRunReport(Walker walker) { - generateGATKRunReport(walker, null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. - */ - protected Collection getArgumentTypeDescriptors() { - return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), - new SAMFileWriterArgumentTypeDescriptor(engine,System.out), - new OutputStreamArgumentTypeDescriptor(engine,System.out) ); - } - - /** - * GATK can add arguments dynamically based on analysis type. - * - * @return true - */ - @Override - protected boolean canAddArgumentsDynamically() { - return true; - } - - /** - * GATK provides the walker as an argument source. - * @return List of walkers to load dynamically. - */ - @Override - protected Class[] getArgumentSources() { - // No walker info? No plugins. 
- if (getAnalysisName() == null) return new Class[] {}; - - Collection argumentSources = new ArrayList(); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - engine.setArguments(getArgumentCollection()); - engine.setWalker(walker); - walker.setToolkit(engine); - argumentSources.add(walker.getClass()); - - Collection filters = engine.createFilters(); - for(ReadFilter filter: filters) - argumentSources.add(filter.getClass()); - - Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; - return argumentSources.toArray(argumentSourcesAsArray); - } - - @Override - protected String getArgumentSourceName( Class argumentSource ) { - return engine.getWalkerName((Class)argumentSource); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java deleted file mode 100644 index 5fc0ccd3e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk; - -import net.sf.picard.PicardException; -import net.sf.samtools.SAMException; -import org.broad.tribble.TribbleException; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.Attribution; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.*; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.util.*; - -/** - * All command line parameters accepted by all tools in the GATK. - * - * The GATK engine itself. Manages map/reduce data access and runs walkers. - * - * We run command line GATK programs using this class. It gets the command line args, parses them, and hands the - * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here, - * the gatk engine should deal with any data related information. 
- */ -@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) -public class CommandLineGATK extends CommandLineExecutable { - @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run") - private String analysisName = null; - - // our argument collection, the collection of command line args we accept - @ArgumentCollection - private GATKArgumentCollection argCollection = new GATKArgumentCollection(); - - /** - * Get pleasing info about the GATK. - * - * @return A list of Strings that contain pleasant info about the GATK. - */ - @Override - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(createApplicationHeader(), - getAttribution(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - getAdditionalHelp()); - } - - @Override - public String getAnalysisName() { - return analysisName; - } - - @Override - protected GATKArgumentCollection getArgumentCollection() { - return argCollection; - } - - /** - * Required main method implementation. - */ - public static void main(String[] argv) { - try { - CommandLineGATK instance = new CommandLineGATK(); - start(instance, argv); - System.exit(CommandLineProgram.result); // todo -- this is a painful hack - } catch (UserException e) { - exitSystemWithUserError(e); - } catch (TribbleException e) { - // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are - // lazy loaded, so they aren't caught elsewhere and made into User Exceptions - exitSystemWithUserError(e); - } catch(PicardException e) { - // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
- exitSystemWithError(e); - } catch (SAMException e) { - checkForMaskedUserErrors(e); - exitSystemWithSamError(e); - } catch (OutOfMemoryError e) { - exitSystemWithUserError(new UserException.NotEnoughMemory()); - } catch (Throwable t) { - checkForMaskedUserErrors(t); - exitSystemWithError(t); - } - } - - public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; - public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; - public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; - - private static void checkForMaskedUserErrors(final Throwable t) { - // masked out of memory error - if ( t instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); - // masked user error - if ( t instanceof UserException || t instanceof TribbleException ) - exitSystemWithUserError(new UserException(t.getMessage())); - - // no message means no masked error - final String message = t.getMessage(); - if ( message == null ) - return; - - // too many open files error - if ( message.contains("Too many open files") ) - exitSystemWithUserError(new UserException.TooManyOpenFiles()); - - // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) - exitSystemWithSamError(t); - - // can't close tribble index when writing - if ( message.contains("Unable to close index for") ) - exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); - - // disk is full - if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - - // masked error wrapped in another one - if ( t.getCause() != null ) - checkForMaskedUserErrors(t.getCause()); - } - - /** - * Creates the a short blurb about the GATK, copyright info, and where to get documentation. - * - * @return The application header. - */ - public static List createApplicationHeader() { - List header = new ArrayList(); - header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); - header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); - return header; - } - - public static String getVersionNumber() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - return headerInfo.containsKey("org.broadinstitute.sting.gatk.version") ? headerInfo.getString("org.broadinstitute.sting.gatk.version") : ""; - } - - public static String getBuildTime() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; - } - - /** - * If the user supplied any additional attribution, return it here. - * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. - */ - private List getAttribution() { - List attributionLines = new ArrayList(); - - // If no analysis name is present, fill in extra help on the walkers. 
- WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(analysisName)) { - Class walkerType = walkerManager.getWalkerClassByName(analysisName); - if(walkerType.isAnnotationPresent(Attribution.class)) - attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); - } - return attributionLines; - } - - /** - * Retrieves additional information about GATK walkers. - * the code in HelpFormatter and supply it as a helper to this method. - * - * @return A string summarizing the walkers available in this distribution. - */ - private String getAdditionalHelp() { - String additionalHelp; - - // If no analysis name is present, fill in extra help on the walkers. - WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(getAnalysisName())) - additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); - else - additionalHelp = getAllWalkerHelp(); - - return additionalHelp; - } - - private static final int PACKAGE_INDENT = 1; - private static final int WALKER_INDENT = 3; - private static final String FIELD_SEPARATOR = " "; - - private String getWalkerHelp(Class walkerType) { - // Construct a help string to output details on this walker. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - formatter.format("Available Reference Ordered Data types:%n"); - formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); - formatter.format("%n"); - - formatter.format("For a full description of this walker, see its GATKdocs at:%n"); - formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); - - return additionalHelp.toString(); - } - - /** - * Load in additional help information about all available walkers. 
- * @return A string representation of the additional help. - */ - private String getAllWalkerHelp() { - // Construct a help string to output available walkers. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - // Get the list of walker names from the walker manager. - WalkerManager walkerManager = engine.getWalkerManager(); - - // Build a list sorted by walker display name. As this information is collected, keep track of the longest - // package / walker name for later formatting. - SortedSet helpText = new TreeSet(new HelpEntryComparator()); - - int longestPackageName = 0; - int longestWalkerName = 0; - for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { - // Get the display name. - String packageName = walkersByPackage.getKey(); - String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); - String packageHelpText = walkerManager.getPackageSummaryText(packageName); - - // Compute statistics about which names is longest. - longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); - - SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); - for(Class walkerType: walkersByPackage.getValue()) { - String walkerName = walkerType.getName(); - String walkerDisplayName = walkerManager.getName(walkerType); - String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); - - longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); - - walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); - } - - // Dump the walkers into the sorted set. 
- helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); - } - - final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); - - - for(HelpEntry packageHelp: helpText) { - printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - for(HelpEntry walkerHelp: packageHelp.children) - printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - // Print a blank line between sets of walkers. - printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); - } - - return additionalHelp.toString(); - } - - private void printDescriptorLine(Formatter formatter, - int headerIndentWidth, - String header, - int headerWidth, - String fieldSeparator, - String description, - int lineWidth) { - final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; - final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; - List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); - - String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; - String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; - String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; - - // Output description line. 
- formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", - "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); - for(int i = 1; i < wordWrappedText.size(); i++) - formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); - } - -} - -/** - * Represents a given help entry; contains a display name, a summary and optionally some children. - */ -class HelpEntry { - public final String uid; - public final String displayName; - public final String summary; - public final SortedSet children; - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - * @param children children for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary, SortedSet children) { - this.uid = uid; - this.displayName = displayName; - this.summary = summary; - this.children = children; - } - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary) { - this(uid,displayName,summary,null); - } - -} - -/** - * Compare two help entries by display name. - */ -class HelpEntryComparator implements Comparator { - private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); - - /** - * Compares the order of lhs to rhs, not taking case into account. - * @param lhs First object to compare. - * @param rhs Second object to compare. - * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. 
Nulls are treated as after everything else. - */ - public int compare(HelpEntry lhs, HelpEntry rhs) { - if(lhs == null && rhs == null) return 0; - if(lhs == null || lhs.displayName.equals("")) return 1; - if(rhs == null || rhs.displayName.equals("")) return -1; - return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); - } - - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java deleted file mode 100644 index 27b030060..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ /dev/null @@ -1,1232 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk; - -import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.gatk.executive.MicroScheduler; -import org.broadinstitute.sting.gatk.filters.FilterManager; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; -import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.samples.SampleDB; -import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; -import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.concurrent.TimeUnit; - -import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; -import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; - -/** - * A GenomeAnalysisEngine that runs a specified walker. - */ -public class GenomeAnalysisEngine { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); - public static final long NO_RUNTIME_LIMIT = -1; - - /** - * The GATK command-line argument parsing code. - */ - private ParsingEngine parsingEngine; - - /** - * The genomeLocParser can create and parse GenomeLocs. - */ - private GenomeLocParser genomeLocParser; - - /** - * Accessor for sharded read data. - */ - private SAMDataSource readsDataSource = null; - - /** - * Accessor for sharded reference data. - */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDB sampleDB = null; - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of intervals used by the engine. 
- */ - private GenomeLocSortedSet intervals = null; - - /** - * Explicitly assign the interval set to use for this traversal (for unit testing purposes) - * @param intervals set of intervals to use for this traversal - */ - public void setIntervals( GenomeLocSortedSet intervals ) { - this.intervals = intervals; - } - - /** - * Collection of inputs used by the engine. - */ - private Map inputs = new HashMap(); - - /** - * Collection of outputs used by the engine. - */ - private Collection> outputs = new ArrayList>(); - - /** - * Collection of the filters applied to the input data. - */ - private Collection filters; - - /** - * Collection of the read transformers applied to the reads - */ - private List readTransformers; - - /** - * Controls the allocation of threads between CPU vs IO. - */ - private ThreadAllocation threadAllocation; - - private ReadMetrics cumulativeMetrics = null; - - /** - * A currently hacky unique name for this GATK instance - */ - private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); - - /** - * our walker manager - */ - private final WalkerManager walkerManager = new WalkerManager(); - - private Walker walker; - - public void setWalker(Walker walker) { - this.walker = walker; - } - - /** - * The short name of the current GATK walker as a string - * @return a non-null String - */ - public String getWalkerName() { - return getWalkerName(walker.getClass()); - } - - /** - * A processed collection of SAM reader identifiers. - */ - private Collection samReaderIDs = Collections.emptyList(); - - /** - * Set the SAM/BAM files over which to traverse. - * @param samReaderIDs Collection of ids to use during this traversal. - */ - public void setSAMFileIDs(Collection samReaderIDs) { - this.samReaderIDs = samReaderIDs; - } - - /** - * Collection of reference metadata files over which to traverse. - */ - private Collection referenceMetaDataFiles; - - /** - * The threading efficiency monitor we use in the GATK to monitor our efficiency. 
- * - * May be null if one isn't active, or hasn't be initialized yet - */ - private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * The global progress meter we are using to track our progress through the genome - */ - private ProgressMeter progressMeter = null; - - /** - * Set the reference metadata files to use for this traversal. - * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. - */ - public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { - this.referenceMetaDataFiles = referenceMetaDataFiles; - } - - /** - * The maximum runtime of this engine, in nanoseconds, set during engine initialization - * from the GATKArgumentCollection command line value - */ - private long runtimeLimitInNanoseconds = -1; - - /** - * Static random number generator and seed. - */ - private static final long GATK_RANDOM_SEED = 47382911L; - private static Random randomGenerator = new Random(GATK_RANDOM_SEED); - public static Random getRandomGenerator() { return randomGenerator; } - public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } - public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } - - /** - * Base Quality Score Recalibration helper object - */ - private BQSRArgumentSet bqsrArgumentSet = null; - public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } - public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } - public void setBaseRecalibration(final GATKArgumentCollection args) { - bqsrArgumentSet = new BQSRArgumentSet(args); - } - - /** - * Actually run the GATK with the specified walker. - * - * @return the value of this traversal. 
- */ - public Object execute() { - // first thing is to make sure the AWS keys can be decrypted - GATKRunReport.checkAWSAreValid(); - - //HeapSizeMonitor monitor = new HeapSizeMonitor(); - //monitor.start(); - setStartTime(new java.util.Date()); - - final GATKArgumentCollection args = this.getArguments(); - - // validate our parameters - if (args == null) { - throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); - } - - // validate our parameters - if (this.walker == null) - throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null."); - - if (args.nonDeterministicRandomSeed) - resetRandomGenerator(System.currentTimeMillis()); - - // if the use specified an input BQSR recalibration table then enable on the fly recalibration - if (args.BQSR_RECAL_FILE != null) - setBaseRecalibration(args); - - // setup the runtime limits - setupRuntimeLimits(args); - - // Determine how the threads should be divided between CPU vs. IO. - determineThreadAllocation(); - - // Prepare the data for traversal. 
- initializeDataSources(); - - // initialize and validate the interval list - initializeIntervals(); - validateSuppliedIntervals(); - - // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary - validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); - - // initialize sampleDB - initializeSampleDB(); - - // our microscheduler, which is in charge of running everything - MicroScheduler microScheduler = createMicroscheduler(); - threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); - - // create temp directories as necessary - initializeTempDirectory(); - - // create the output streams - initializeOutputStreams(microScheduler.getOutputTracker()); - - // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on - logger.info("Preparing for traversal" + - (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); - Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); - logger.info("Done preparing for traversal"); - - // execute the microscheduler, storing the results - return microScheduler.execute(this.walker, shardStrategy); - - //monitor.stop(); - //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); - - //return result; - } - - /** - * Retrieves an instance of the walker based on the walker name. - * - * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. - * @return An instance of the walker. 
- */ - public Walker getWalkerByName(String walkerName) { - try { - return walkerManager.createByName(walkerName); - } catch ( UserException e ) { - if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); - } - throw e; - } - } - - /** - * Gets the name of a given walker type. - * @param walkerType Type of walker. - * @return Name of the walker. - */ - public String getWalkerName(Class walkerType) { - return walkerManager.getName(walkerType); - } - - public String getName() { - return myName; - } - - /** - * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; - * the caller must handle that directly. - * @return A collection of available filters. - */ - public Collection createFilters() { - final List filters = new LinkedList<>(); - - // First add the user requested filters - if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(final String filterName: this.getArguments().readFilters) - filters.add(this.getFilterManager().createByName(filterName)); - - // now add the walker default filters. 
This ordering is critical important if - // users need to apply filters that fix up reads that would be removed by default walker filters - filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); - - return Collections.unmodifiableList(filters); - } - - /** - * Returns a list of active, initialized read transformers - * - * @param walker the walker we need to apply read transformers too - */ - public void initializeReadTransformers(final Walker walker) { - // keep a list of the active read transformers sorted based on priority ordering - List activeTransformers = new ArrayList(); - - final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); - final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; - - final PluginManager pluginManager = new PluginManager(ReadTransformer.class); - - for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { - transformer.initialize(overrideTime, this, walker); - if ( transformer.enabled() ) - activeTransformers.add(transformer); - } - - setReadTransformers(activeTransformers); - } - - public List getReadTransformers() { - return readTransformers; - } - - /* - * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
- * - * @param readTransformers the active read transformers - */ - protected void checkActiveReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new IllegalArgumentException("read transformers cannot be null"); - - ReadTransformer sawMustBeFirst = null; - ReadTransformer sawMustBeLast = null; - - for ( final ReadTransformer r : readTransformers ) { - if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { - if ( sawMustBeFirst != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); - sawMustBeFirst = r; - } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { - if ( sawMustBeLast != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); - sawMustBeLast = r; - } - } - } - - protected void setReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new ReviewedStingException("read transformers cannot be null"); - - // sort them in priority order - Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); - - // make sure we don't have an invalid set of active read transformers - checkActiveReadTransformers(readTransformers); - - this.readTransformers = readTransformers; - } - - /** - * Parse out the thread allocation from the given command-line argument. 
- */ - private void determineThreadAllocation() { - if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); - if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); - if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, - argCollection.numberOfCPUThreadsPerDataThread, - argCollection.numberOfIOThreads, - argCollection.monitorThreadEfficiency); - } - - public int getTotalNumberOfThreads() { - return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); - } - - - - /** - * Allow subclasses and others within this package direct access to the walker manager. - * @return The walker manager used by this package. - */ - protected WalkerManager getWalkerManager() { - return walkerManager; - } - - /** - * setup a microscheduler - * - * @return a new microscheduler - */ - private MicroScheduler createMicroscheduler() { - // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. 
- if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && - this.getArguments().referenceFile == null) { - throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); - } - - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); - } - - protected DownsamplingMethod getDownsamplingMethod() { - GATKArgumentCollection argCollection = this.getArguments(); - - DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); - - DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; - method.checkCompatibilityWithWalker(walker); - return method; - } - - protected void setDownsamplingMethod(DownsamplingMethod method) { - argCollection.setDownsamplingMethod(method); - } - - protected boolean includeReadsWithDeletionAtLoci() { - return walker.includeReadsWithDeletionAtLoci(); - } - - /** - * Verifies that the supplied set of reads files mesh with what the walker says it requires, - * and also makes sure that there were no duplicate SAM files specified on the command line. - */ - protected void validateSuppliedReads() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - if (WalkerManager.isRequired(walker, DataSource.READS) && (arguments.samFiles == null || arguments.samFiles.size() == 0)) - throw new ArgumentException("Walker requires reads but none were provided."); - - // Check what the walker says is allowed against what was provided on the command line. 
- if ((arguments.samFiles != null && arguments.samFiles.size() > 0) && !WalkerManager.isAllowed(walker, DataSource.READS)) - throw new ArgumentException("Walker does not allow reads but reads were provided."); - - // Make sure no SAM files were specified multiple times by the user. - checkForDuplicateSamFiles(); - } - - /** - * Checks whether there are SAM files that appear multiple times in the fully unpacked list of - * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. - */ - protected void checkForDuplicateSamFiles() { - Set encounteredSamFiles = new HashSet(); - Set duplicateSamFiles = new LinkedHashSet(); - - for ( SAMReaderID samFile : samReaderIDs ) { - if ( encounteredSamFiles.contains(samFile) ) { - duplicateSamFiles.add(samFile.getSamFilePath()); - } - else { - encounteredSamFiles.add(samFile); - } - } - - if ( duplicateSamFiles.size() > 0 ) { - throw new UserException("The following BAM files appear multiple times in the list of input files: " + - duplicateSamFiles + " BAM files may be specified at most once."); - } - } - - /** - * Verifies that the supplied reference file mesh with what the walker says it requires. - */ - protected void validateSuppliedReference() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. - if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) - throw new ArgumentException("Walker requires a reference but none was provided."); - - // Check what the walker says is allowed against what was provided on the command line. 
- if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) - throw new ArgumentException("Walker does not allow a reference but one was provided."); - } - - protected void validateSuppliedIntervals() { - // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. - if(!(walker instanceof ReadWalker)) { - GenomeLocSortedSet intervals = getIntervals(); - if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) - throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); - } - - // If intervals is non-null and empty at this point, it means that the list of intervals to process - // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since - // this was very likely unintentional, the user should be informed of this. Note that this is different - // from the case where intervals == null, which indicates that there were no interval arguments. - if ( intervals != null && intervals.isEmpty() ) { - logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); - } - - // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome - } - - /** - * Get the sharding strategy given a driving data source. - * - * @param readsDataSource readsDataSource - * @param drivingDataSource Data on which to shard. - * @param intervals intervals - * @return the sharding strategy - */ - protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { - ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); - DownsamplingMethod downsamplingMethod = readsDataSource != null ? 
readsDataSource.getReadsInfo().getDownsamplingMethod() : null; - ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - - // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. - if(!readsDataSource.isEmpty()) { - if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) - throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) - throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - - if(walker instanceof LocusWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } - else if(walker instanceof ActiveRegionWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); - } - else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { - // Apply special validation to read pair walkers. - if(walker instanceof ReadPairWalker) { - if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - } - - if(intervals == null) - return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); - } - else - throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); - } - else { - // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well - // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard - // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] - final int SHARD_SIZE = walker instanceof RodWalker ? 
1000000 : 100000; - if(intervals == null) - return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); - else - return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); - } - } - - protected boolean flashbackData() { - return walker instanceof ReadWalker; - } - - /** - * Create the temp directory if it doesn't exist. - */ - private void initializeTempDirectory() { - File tempDir = new File(System.getProperty("java.io.tmpdir")); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Unable to create directory"); - } - - /** - * Initialize the output streams as specified by the user. - * - * @param outputTracker the tracker supplying the initialization data. - */ - private void initializeOutputStreams(OutputTracker outputTracker) { - for (Map.Entry input : getInputs().entrySet()) - outputTracker.addInput(input.getKey(), input.getValue()); - for (Stub stub : getOutputs()) - outputTracker.addOutput(stub); - - outputTracker.prepareWalker(walker, getArguments().strictnessLevel); - } - - public ReferenceDataSource getReferenceDataSource() { - return referenceDataSource; - } - - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * Manage lists of filters. - */ - private final FilterManager filterManager = new FilterManager(); - - private Date startTime = null; // the start time for execution - - public void setParser(ParsingEngine parsingEngine) { - this.parsingEngine = parsingEngine; - } - - /** - * Explicitly set the GenomeLocParser, for unit testing. - * @param genomeLocParser GenomeLocParser to use. 
- */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Sets the start time when the execute() function was last called - * @param startTime the start time when the execute() function was last called - */ - protected void setStartTime(Date startTime) { - this.startTime = startTime; - } - - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - /** - * Setup the intervals to be processed - */ - protected void initializeIntervals() { - intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); - } - - /** - * Add additional, externally managed IO streams for inputs. - * - * @param argumentSource Field into which to inject the value. - * @param value Instance to inject. - */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for output. - * - * @param stub Instance to inject. - */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Returns the tag associated with a given command-line argument. - * @param key Object for which to inspect the tag. - * @return Tags object associated with the given key, or an empty Tag structure if none are present. 
- */ - public Tags getTags(Object key) { - return parsingEngine.getTags(key); - } - - protected void initializeDataSources() { - logger.info("Strictness is " + argCollection.strictnessLevel); - - validateSuppliedReference(); - setReferenceDataSource(argCollection.referenceFile); - - validateSuppliedReads(); - initializeReadTransformers(walker); - - readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); - - for (ReadFilter filter : filters) - filter.initialize(this); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); - } - - /** - * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or - * need to absolutely positively kill everyone in the room) - * @param dataSource - */ - public void setReadsDataSource(final SAMDataSource dataSource) { - this.readsDataSource = dataSource; - } - - /** - * Entry-point function to initialize the samples database from input data and pedigree arguments - */ - private void initializeSampleDB() { - SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); - sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); - sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); - sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); - sampleDB = sampleDBBuilder.getFinalSampleDB(); - } - - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. 
- */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getReadsDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. - */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getReadsDataSource().getSAMFile(id); - } - - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. - * @param rods a collection of the reference ordered data tracks - */ - private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { - if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. - SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, - "reference", referenceDictionary, true, intervals); - } - - for (ReferenceOrderedDataSource rod : rods) - IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); - } - - /** - * Gets a data source for the given set of reads. 
- * - * @param argCollection arguments - * @param genomeLocParser parser - * @param refReader reader - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); - - // Synchronize the method back into the collection so that it shows up when - // interrogating for the downsampling method during command line recreation. - setDownsamplingMethod(downsamplingMethod); - - logger.info(downsamplingMethod); - - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) - throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); - - boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); - - if (argCollection.keepProgramRecords) - removeProgramRecords = false; - - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; - - final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? - loadSampleRenameMap(argCollection.sampleRenameMappingFile) : - null; - - return new SAMDataSource( - samReaderIDs, - threadAllocation, - argCollection.numberOfBAMFileHandles, - genomeLocParser, - argCollection.useOriginalBaseQualities, - argCollection.strictnessLevel, - argCollection.readBufferSize, - downsamplingMethod, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - readTransformers, - includeReadsWithDeletionAtLoci(), - argCollection.defaultBaseQualities, - removeProgramRecords, - keepReadsInLIBS, - sampleRenameMap); - } - - /** - * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory - * HashMap. 
This file must consist of lines with two whitespace-separated fields: - * - * absolute_path_to_bam_file new_sample_name - * - * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample - * renaming feature is being used. - * - * @param sampleRenameMapFile sample rename map file from which to load data - * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and - * the values being the new sample names. - */ - protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { - logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); - - final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); - - try { - for ( final String line : new XReadLines(sampleRenameMapFile) ) { - final String[] tokens = line.split("\\s+"); - - if ( tokens.length != 2 ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", - tokens.length, line)); - } - - final File bamFile = new File(tokens[0]); - final String newSampleName = tokens[1]; - - if ( ! bamFile.isAbsolute() ) { - throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line); - } - - final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags()); - - if ( sampleRenameMap.containsKey(bamID) ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Bam file %s appears more than once", bamFile.getAbsolutePath())); - } - - sampleRenameMap.put(bamID, newSampleName); - } - } - catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); - } - - return sampleRenameMap; - } - - - /** - * Opens a reference sequence file paired with an index. Only public for testing purposes - * - * @param refFile Handle to a reference sequence file. 
Non-null. - */ - public void setReferenceDataSource(File refFile) { - this.referenceDataSource = new ReferenceDataSource(refFile); - genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); - } - - /** - * Open the reference-ordered data sources. - * - * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. - * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. - * @param genomeLocParser to use when creating and validating GenomeLocs. - * @param validationExclusionType potentially indicate which validations to include / exclude. - * - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(Collection referenceMetaDataFiles, - SAMSequenceDictionary sequenceDictionary, - GenomeLocParser genomeLocParser, - ValidationExclusion.TYPE validationExclusionType) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, - getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); - - final List dataSources = new ArrayList(); - for (RMDTriplet fileDescriptor : referenceMetaDataFiles) - dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, - builder, - sequenceDictionary, - genomeLocParser, - flashbackData())); - - return dataSources; - } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - public boolean lenientVCFProcessing() { - return lenientVCFProcessing(argCollection.unsafe); - } - - public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { - return val == ValidationExclusion.TYPE.ALL - || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. 
- * @return Header for that reader or null if not available. - */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource == null ? null : readsDataSource.getHeader(reader); - } - - /** - * Returns an ordered list of the unmerged SAM file headers known to this engine. - * @return list of header for each input SAM file, in command line order - */ - public List getSAMFileHeaders() { - final List headers = new ArrayList(); - for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { - headers.add(getReadsDataSource().getHeader(id)); - } - return headers; - } - - /** - * Gets the master sequence dictionary for this GATK engine instance - * @return a never-null dictionary listing all of the contigs known to this engine instance - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return getReferenceDataSource().getReference().getSequenceDictionary(); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getReadsDataSource() { - return this.readsDataSource; - } - - /** - * Sets the collection of GATK main application arguments. - * - * @param argCollection the GATK argument collection - */ - public void setArguments(GATKArgumentCollection argCollection) { - this.argCollection = argCollection; - } - - /** - * Gets the collection of GATK main application arguments. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. - * @return List of intervals, or null if no intervals are in use - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Get the list of regions of the genome being processed. 
If the user - * requested specific intervals, return those, otherwise return regions - * corresponding to the entire genome. Never returns null. - * - * @return a non-null set of intervals being processed - */ - @Ensures("result != null") - public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { - if ( getIntervals() == null ) - // if we don't have any intervals defined, create intervals from the reference itself - return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); - else - return getIntervals(); - } - - /** - * Gets the list of filters employed by this engine. - * @return Collection of filters (actual instances) used by this engine. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Sets the list of filters employed by this engine. - * @param filters Collection of filters (actual instances) used by this engine. - */ - public void setFilters(Collection filters) { - this.filters = filters; - } - - /** - * Gets the filter manager for this engine. - * @return filter manager for this engine. - */ - protected FilterManager getFilterManager() { - return filterManager; - } - - /** - * Gets the input sources for this engine. - * @return input sources for this engine. - */ - protected Map getInputs() { - return inputs; - } - - /** - * Gets the output stubs for this engine. - * @return output stubs for this engine. - */ - protected Collection> getOutputs() { - return outputs; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources - */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * Returns a clone of this snapshot in time. - * @return cumulative metrics about the entire run at this point. 
ReadMetrics object is a unique instance and is - * owned by the caller; the caller can do with the object what they wish. - */ - public ReadMetrics getCumulativeMetrics() { - // todo -- probably shouldn't be lazy - if ( cumulativeMetrics == null ) - cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); - return cumulativeMetrics; - } - - /** - * Return the global ThreadEfficiencyMonitor, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - // ------------------------------------------------------------------------------------- - // - // code for working with Samples database - // - // ------------------------------------------------------------------------------------- - - public SampleDB getSampleDB() { - return this.sampleDB; - } - - public Map getApproximateCommandLineArguments(Object... argumentProviders) { - return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); - } - - public String createApproximateCommandLineArgumentString(Object... 
argumentProviders) { - return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); - } - - // ------------------------------------------------------------------------------------- - // - // code for working with progress meter - // - // ------------------------------------------------------------------------------------- - - /** - * Register the global progress meter with this engine - * - * Calling this function more than once will result in an IllegalStateException - * - * @param meter a non-null progress meter - */ - public void registerProgressMeter(final ProgressMeter meter) { - if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); - if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); - - progressMeter = meter; - } - - /** - * Get the progress meter being used by this engine. May be null if no meter has been registered yet - * @return a potentially null pointer to the progress meter - */ - public ProgressMeter getProgressMeter() { - return progressMeter; - } - - /** - * Does the current runtime in unit exceed the runtime limit, if one has been provided? 
- * - * @return false if not limit was requested or if runtime <= the limit, true otherwise - */ - public boolean exceedsRuntimeLimit() { - if ( progressMeter == null ) - // not yet initialized or not set because of testing - return false; - - final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); - if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); - - if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) - return false; - else { - final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); - return runtime > maxRuntimeNano; - } - } - - /** - * @return the runtime limit in nanoseconds, or -1 if no limit was specified - */ - public long getRuntimeLimitInNanoseconds() { - return runtimeLimitInNanoseconds; - } - - /** - * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds - * as appropriate - * - * @param args the GATKArgumentCollection to retrieve our runtime limits from - */ - private void setupRuntimeLimits(final GATKArgumentCollection args) { - if ( args.maxRuntime == NO_RUNTIME_LIMIT ) - runtimeLimitInNanoseconds = -1; - else if (args.maxRuntime < 0 ) - throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); - else { - runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java deleted file mode 100644 index 08f892f97..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ /dev/null @@ -1,483 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.arguments; - -import net.sf.samtools.SAMFileReader; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * @author aaron - * @version 1.0 - */ -public class GATKArgumentCollection { - - /* our version number */ - private float versionNumber = 
1; - private String description = "GATK Arguments"; - - /** the constructor */ - public GATKArgumentCollection() { - } - - // parameters and their defaults - @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) - public List samFiles = new ArrayList(); - - @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) - public Integer readBufferSize = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // GATKRunReport options - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? AWS is the default, can be NO_ET so nothing is posted to the run repository. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false) - public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; - - @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false) - public File gatkKeyFile = null; - - /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. - * - * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find - * meaningful. 
- */ - @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false) - public String tag = "NA"; - - // -------------------------------------------------------------------------------------------------------------- - // - // General features - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) - public List readFilters = new ArrayList(); - - @ArgumentCollection - public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); - - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - - @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) - public boolean nonDeterministicRandomSeed = false; - - @Hidden - @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.") - public boolean disableDithering = false; - - @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. 
By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false) - public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; - - @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="The TimeUnit for maxRuntime", required = false) - public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; - - // -------------------------------------------------------------------------------------------------------------- - // - // Downsampling Arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * Reads will be selected randomly to be removed from the pile based on the method described here. - */ - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false) - public DownsampleType downsamplingType = null; - - @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) - public Double downsampleFraction = null; - - /** - * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the - * maximum depth of coverage at each locus. For read-based traversals (ReadWalkers), it controls the - * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use - * much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does - * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the - * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when - * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. 
Also note - * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling - * algorithm will under some circumstances retain slightly more or less coverage than requested. - */ - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)", - required = false) - public Integer downsampleCoverage = null; - - /** - * Gets the downsampling method explicitly specified by the user. If the user didn't specify - * a default downsampling mechanism, return the default. - * @return The explicitly specified downsampling mechanism, or the default if none exists. - */ - public DownsamplingMethod getDownsamplingMethod() { - if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) - return null; - - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); - } - - /** - * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. - * @param method The downsampling mechanism. 
- */ - public void setDownsamplingMethod(DownsamplingMethod method) { - if (method == null) - throw new IllegalArgumentException("method is null"); - - downsamplingType = method.type; - downsampleCoverage = method.toCoverage; - downsampleFraction = method.toFraction; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // BAQ arguments - // - // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) - public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; - - @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; - - // -------------------------------------------------------------------------------------------------------------- - // - // quality encoding checking arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is - * simple: we just iterate over all reads and subtract 31 from every quality score. 
- */ - @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) - public boolean FIX_MISENCODED_QUALS = false; - - @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountering base qualities that are too high and that seemingly indicate a problem with the base quality encoding of the BAM file", required = false) - public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; - - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) - public Boolean useOriginalBaseQualities = false; - - @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) - public byte defaultBaseQualities = -1; - - // -------------------------------------------------------------------------------------------------------------- - // - // performance log arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * The file name for the GATK performance log output, or null if you don't want to generate the - * detailed performance logging table. 
This table is suitable for importing into R or any - * other analysis software that can read tsv files - */ - @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) - public File performanceLog = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // BQSR arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. - * Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). - */ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration (intended for use with BaseRecalibrator and PrintReads)") - public File BQSR_RECAL_FILE = null; - - /** - * Turns on the base quantization module. It requires a recalibration report (-BQSR). - * - * A value of 0 here means "do not quantize". - * Any value greater than zero will be used to recalculate the quantization using that many levels. - * Negative values mean that we should quantize using the recalibration report's quantization level. - */ - @Hidden - @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) - public int quantizationLevels = 0; - - /** - * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced. 
- */ - @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "If true, disables printing of base insertion and base deletion tags (with -BQSR)", required=false) - public boolean disableIndelQuals = false; - - /** - * By default, the OQ tag in not emitted when using the -BQSR argument. - */ - @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "If true, enables printing of the OQ tag with the original base qualities (with -BQSR)", required=false) - public boolean emitOriginalQuals = false; - - /** - * Do not modify quality scores less than this value but rather just write them out unmodified in the recalibrated BAM file. - * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. - * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, - * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, - * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. - */ - @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)", required = false) - public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; - - @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "The global Qscore Bayesian prior to use in the BQSR. If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score", required = false) - public double globalQScorePrior = -1.0; - - /** - * For the sake of your data, please only use this option if you know what you are doing. It is absolutely not recommended practice - * to run base quality score recalibration on reduced BAM files. 
- */ - @Advanced - @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Do not fail when running base quality score recalibration on a reduced BAM file even though we highly recommend against it", required = false) - public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Other utility arguments - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) - public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; - - @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false) - public boolean removeProgramRecords = false; - - @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false) - public boolean keepProgramRecords = false; - - @Advanced - @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", - doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " + - "each BAM file listed in the mapping file have only a single sample specified in its header (though there " + - "may be multiple read groups for that sample). 
Each line of the mapping file must contain the absolute path " + - "to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.", - required = false) - public File sampleRenameMappingFile = null; - - @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) - public ValidationExclusion.TYPE unsafe; - - @Hidden - @Advanced - @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", - doc = "UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking " + - "when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index " + - "generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it " + - "for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general " + - "because it allows reading from index files without first acquiring a lock.", - required = false) - public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Multi-threading arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per - * data thread, and act as completely data parallel processing, increasing the memory usage of GATK - * by M data threads. 
Data threads generally scale extremely effectively, up to 24 cores - */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false) - public Integer numberOfDataThreads = 1; - - /** - * How many CPU threads should be allocated per data thread? Each CPU thread operates the map - * cycle independently, but may run into earlier scaling problems with IO than data threads. Has - * the benefit of not requiring X times as much memory per thread as data threads do, but rather - * only a constant overhead. - */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) - public int numberOfCPUThreadsPerDataThread = 1; - - @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) - @Hidden - public int numberOfIOThreads = 0; - - /** - * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny - * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for - * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. 
- */ - @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false) - public Boolean monitorThreadEfficiency = false; - - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) - public Integer numberOfBAMFileHandles = null; - - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) - public List readGroupBlackList = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // PED (pedigree) support - // - // -------------------------------------------------------------------------------------------------------------- - - /** - *

Reads PED file-formatted tabular text files describing meta-data about the samples being - * processed in the GATK.

- * - * - * - *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

- * - *
    - *
  • Family ID
  • - *
  • Individual ID
  • - *
  • Paternal ID
  • - *
  • Maternal ID
  • - *
  • Sex (1=male; 2=female; other=unknown)
  • - *
  • Phenotype
  • - *
- * - *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. - * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a - * quantitative trait or an affection status column: GATK will automatically detect which type - * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

- * - *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

- * - *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that - * line will be ignored. Do not start any family IDs with this character therefore.

- * - *

Affection status should be coded:

- * - *
    - *
  • -9 missing
  • - *
  • 0 missing
  • - *
  • 1 unaffected
  • - *
  • 2 affected
  • - *
- * - *

If any value outside of -9,0,1,2 is detected than the samples are assumed - * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely - * represents the missing value.

- * - *

Genotypes (column 7 onwards) cannot be specified to the GATK.

- * - *

For example, here are two individuals (one row = one person):

- * - *
-     *   FAM001  1  0 0  1  2
-     *   FAM001  2  0 0  1  2
-     * 
- * - *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to - * tell the GATK PED parser that the corresponding fields are missing from the ped file.

- * - *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree - * data should clearly indicate so in their arguments and will throw errors if required pedigree - * information is missing.

- */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) - public List pedigreeFiles = Collections.emptyList(); - - /** - * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more - * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString - * as -ped supports - */ - @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) - public List pedigreeStrings = Collections.emptyList(); - - /** - * How strict should we be in parsing the PED files? - */ - @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) - public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; - - // -------------------------------------------------------------------------------------------------------------- - // - // BAM indexing and sharding arguments - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. 
Use at your own risk.",required=false) - @Hidden - public boolean allowIntervalsWithUnindexedBAM = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing BCF2 - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes",required=false) - @Hidden - public boolean generateShadowBCF = false; - // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - // -------------------------------------------------------------------------------------------------------------- - // - // VCF/BCF index parameters - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Specify the Tribble indexing strategy to use for VCFs. 
- * - * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter - * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter - * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - */ - - @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false) - @Advanced - public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - - @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false) - @Advanced - public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java deleted file mode 100644 index 7077db49c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ /dev/null @@ -1,463 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or 
substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.executive; - -import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.traversals.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; -import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; - -import javax.management.JMException; -import javax.management.MBeanServer; -import javax.management.ObjectName; -import java.io.File; -import java.lang.management.ManagementFactory; -import 
java.util.*; - - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Apr 26, 2009 - * Time: 12:37:23 PM - * - * General base class for all scheduling algorithms - * Shards and schedules data in manageable chunks. - * - * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary - * because in the HMS case you have multiple threads executing a traversal engine independently, and - * these engines may need to create separate resources for efficiency or implementation reasons. For example, - * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. - * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have - * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler - * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler - * can properly shut them all down when the scheduling is done. - * - */ -public abstract class MicroScheduler implements MicroSchedulerMBean { - protected static final Logger logger = Logger.getLogger(MicroScheduler.class); - - /** - * The list of all Traversal engines we've created in this micro scheduler - */ - final List allCreatedTraversalEngines = new LinkedList(); - - /** - * All available engines. Engines are borrowed and returned when a subclass is actually - * going to execute the engine on some data. This allows us to have N copies for - * N data parallel executions, but without the dangerous code of having local - * ThreadLocal variables. - */ - final LinkedList availableTraversalEngines = new LinkedList(); - - /** - * Engines that have been allocated to a key already. - */ - final HashMap allocatedTraversalEngines = new HashMap(); - - /** - * Counts the number of instances of the class that are currently alive. - */ - private static int instanceNumber = 0; - - /** - * The engine invoking this scheduler. 
- */ - protected final GenomeAnalysisEngine engine; - - protected final IndexedFastaSequenceFile reference; - - private final SAMDataSource reads; - protected final Collection rods; - - private final MBeanServer mBeanServer; - private final ObjectName mBeanName; - - /** - * Threading efficiency monitor for tracking the resource utilization of the GATK - * - * may be null - */ - ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * MicroScheduler factory function. Create a microscheduler appropriate for reducing the - * selected walker. - * - * @param walker Which walker to use. - * @param reads the informations associated with the reads - * @param reference the reference file - * @param rods the rods to include in the traversal - * @param threadAllocation Number of threads to utilize. - * - * @return The best-fit microscheduler. - */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) { - logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + - "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", - threadAllocation.getTotalNumThreads(), - threadAllocation.getNumCPUThreadsPerDataThread(), - threadAllocation.getNumDataThreads(), - Runtime.getRuntime().availableProcessors())); - if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) - logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + - "available processors on this machine %d", threadAllocation.getTotalNumThreads(), - Runtime.getRuntime().availableProcessors())); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - if (walker.isReduceByInterval()) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. 
Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - - if ( ! (walker instanceof TreeReducible) ) { - throw badNT("nt", engine, walker); - } - } - - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) { - throw badNT("nct", engine, walker); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } - } - - private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue(parallelArg, - String.format("The analysis %s currently does not support parallel execution with %s. " + - "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); - } - - /** - * Create a microscheduler given the reads and reference. - * - * @param walker the walker to execute with - * @param reads The reads. - * @param reference The reference. - * @param rods the rods to include in the traversal - * @param threadAllocation the allocation of threads to use in the underlying traversal - */ - protected MicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - this.engine = engine; - this.reads = reads; - this.reference = reference; - this.rods = rods; - - final File progressLogFile = engine.getArguments() == null ? 
null : engine.getArguments().performanceLog; - - // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, - // and adds it to the list of created engines for later shutdown. - for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { - final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); - allCreatedTraversalEngines.add(traversalEngine); - availableTraversalEngines.add(traversalEngine); - } - - // Create the progress meter, and register it with the analysis engine - engine.registerProgressMeter(new ProgressMeter(progressLogFile, - availableTraversalEngines.peek().getTraversalUnits(), - engine.getRegionsOfGenomeBeingProcessed())); - - // Now that we have a progress meter, go through and initialize the traversal engines - for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) - traversalEngine.initialize(engine, walker, engine.getProgressMeter()); - - // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. - // To get around this limitation and since we have no job identifier at this point, register a simple counter that - // will count the number of instances of this object that have been created in this JVM. 
- int thisInstance = instanceNumber++; - mBeanServer = ManagementFactory.getPlatformMBeanServer(); - try { - mBeanName = new ObjectName("org.broadinstitute.sting.gatk.executive:type=MicroScheduler,instanceNumber="+thisInstance); - mBeanServer.registerMBean(this, mBeanName); - } - catch (JMException ex) { - throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); - } - } - - /** - * Really make us a traversal engine of the appropriate type for walker and thread allocation - * - * @return a non-null uninitialized traversal engine - */ - @Ensures("result != null") - private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { - if (walker instanceof ReadWalker) { - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof LocusWalker) { - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof DuplicateWalker) { - return new TraverseDuplicates(); - } else if (walker instanceof ReadPairWalker) { - return new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } - } - - - /** - * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - /** - * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses - * - * @param threadEfficiencyMonitor - */ - public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { - this.threadEfficiencyMonitor = threadEfficiencyMonitor; - } - - /** - * Should we 
stop all execution work and exit gracefully? - * - * Returns true in the case where some external signal or time limit has been received, indicating - * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown - * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler - * examine this value as often as reasonable and, if it returns true, stop what they are doing - * at the next available opportunity, shutdown their resources, call notify done, and return. - * - * @return true if we should abort execution, or false otherwise - */ - protected boolean abortExecution() { - final boolean abort = engine.exceedsRuntimeLimit(); - if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); - logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); - } - return abort; - } - - /** - * Walks a walker over the given list of intervals. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - * - * @return the return type of the walker - */ - public abstract Object execute(Walker walker, Iterable shardStrategy); - - /** - * Tells this MicroScheduler that the execution of one of the subclass of this object as started - * - * Must be called when the implementation of execute actually starts up - * - * Currently only starts the progress meter timer running, but other start up activities could be incorporated - */ - protected void startingExecution() { - engine.getProgressMeter().start(); - } - - /** - * Retrieves the object responsible for tracking and managing output. - * @return An output tracker, for loading data in and extracting results. Will not be null. - */ - public abstract OutputTracker getOutputTracker(); - - /** - * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
- * @param shard the shard to use when querying reads. - * @return an iterator over the reads specified in the shard. - */ - protected StingSAMIterator getReadIterator(Shard shard) { - return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); - } - - /** - * Must be called by subclasses when execute is done - */ - protected void executionIsDone() { - engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); - printReadFilteringStats(); - shutdownTraversalEngines(); - - // Print out the threading efficiency of this HMS, if state monitoring is enabled - if ( threadEfficiencyMonitor != null ) { - // include the master thread information - threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); - threadEfficiencyMonitor.printUsageInformation(logger); - } - } - - /** - * Shutdown all of the created engines, and clear the list of created engines, dropping - * pointers to the traversal engines - */ - public synchronized void shutdownTraversalEngines() { - for ( final TraversalEngine te : allCreatedTraversalEngines) - te.shutdown(); - - allCreatedTraversalEngines.clear(); - availableTraversalEngines.clear(); - } - - /** - * Prints out information about number of reads observed and filtering, if any reads were used in the traversal - * - * Looks like: - * - * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter - * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter - */ - private void printReadFilteringStats() { - final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); - - for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - } - } - - /** - * Gets the engine that created this microscheduler. - * @return The engine owning this microscheduler. - */ - public GenomeAnalysisEngine getEngine() { return engine; } - - /** - * Returns data source maintained by this scheduler - * @return - */ - public SAMDataSource getSAMDataSource() { return reads; } - - /** - * Returns the reference maintained by this scheduler. - * @return The reference maintained by this scheduler. - */ - public IndexedFastaSequenceFile getReference() { return reference; } - - protected void cleanup() { - try { - mBeanServer.unregisterMBean(mBeanName); - } - catch (JMException ex) { - throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex); - } - } - - /** - * Returns a traversal engine suitable for use, associated with key - * - * Key is an arbitrary object that is used to retrieve the same traversal - * engine over and over. This can be important in the case where the - * traversal engine has data associated with it in some other context, - * and we need to ensure that the context always sees the same traversal - * engine. This happens in the HierarchicalMicroScheduler, where you want - * the a thread executing traversals to retrieve the same engine each time, - * as outputs are tracked w.r.t. that engine. 
- * - * If no engine is associated with key yet, pops the next available engine - * from the available ones maintained by this - * microscheduler. Note that it's a runtime error to pop a traversal engine - * from this scheduler if there are none available. Callers that - * once pop'd an engine for use must return it with returnTraversalEngine - * - * @param key the key to associate with this engine - * @return a non-null TraversalEngine suitable for execution in this scheduler - */ - @Ensures("result != null") - protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { - if ( key == null ) throw new IllegalArgumentException("key cannot be null"); - - final TraversalEngine engine = allocatedTraversalEngines.get(key); - if ( engine == null ) { - if ( availableTraversalEngines.isEmpty() ) - throw new IllegalStateException("no traversal engines were available"); - allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); - return allocatedTraversalEngines.get(key); - } else { - return engine; - } - } - - /** - * Return a borrowed traversal engine to this MicroScheduler, for later use - * in another traversal execution - * - * @param key the key used to id the engine, provided to the borrowTraversalEngine function - * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. - */ - protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { - if ( traversalEngine == null ) - throw new IllegalArgumentException("Attempting to push a null traversal engine"); - if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) - throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); - if ( ! 
allocatedTraversalEngines.containsKey(key) ) - throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); - - // note there's nothing to actually do here, but a function implementation - // might want to do something - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java deleted file mode 100644 index e0b5dd4cb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; - -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * Utilities for working with Sequence Dictionaries embedded in tribble indices - * - * @author Your Name - * @since Date created - */ -public class IndexDictionaryUtils { - private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); - - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - - /** - * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet 
seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - - public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - index.addProperty(contig,length); - } - } - - public static void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict, - final ValidationExclusion.TYPE validationExclusionType ) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java deleted file mode 100644 index 4c50cfaae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ /dev/null @@ -1,416 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the 
Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.Tribble; -import org.broad.tribble.TribbleException; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.file.FSLockWithShared; 
-import org.broadinstitute.sting.utils.instrumentation.Sizeof; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; - - -/** - * - * @author aaron - * ` - * Class RMDTrackBuilder - * - * This class keeps track of the available codecs, and knows how to put together a track of - * that gets iterators from the FeatureReader using Tribble. - * - */ -public class RMDTrackBuilder { // extends PluginManager { - /** - * our log, which we use to capture anything from this class - */ - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // private sequence dictionary we use to set our tracks with - private final SAMSequenceDictionary dict; - - /** - * Private genome loc parser to use when building out new locs. - */ - private final GenomeLocParser genomeLocParser; - - /** - * Validation exclusions, for validating the sequence dictionary. - */ - private ValidationExclusion.TYPE validationExclusionType; - - private final FeatureManager featureManager; - - // If true, do not attempt to create index files if they don't exist or are outdated, and don't - // make any file lock acquisition calls on the index files. - private final boolean disableAutoIndexCreation; - - /** - * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally - * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, - * please talk through your approach with the SE team. - * @param dict Sequence dictionary to use. - * @param genomeLocParser Location parser to use. - * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. - * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. - * UNSAFE in general (because it causes us not to lock index files before reading them) -- - * suitable only for test suite use. 
- */ - public RMDTrackBuilder(final SAMSequenceDictionary dict, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final boolean disableAutoIndexCreation) { - this.dict = dict; - this.validationExclusionType = validationExclusionType; - this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); - this.disableAutoIndexCreation = disableAutoIndexCreation; - } - - /** - * Return the feature manager this RMDTrackBuilder is using the create tribble tracks - * - * @return - */ - public FeatureManager getFeatureManager() { - return featureManager; - } - - /** - * create a RMDTrack of the specified type - * - * @param fileDescriptor a description of the type of track to build. - * - * @return an instance of the track - */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { - String name = fileDescriptor.getName(); - File inputFile = new File(fileDescriptor.getFile()); - - FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); - if (descriptor == null) - throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); - - // return a feature reader track - Pair pair; - if (inputFile.getAbsolutePath().endsWith(".gz")) - pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); - else - pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); - if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name)); - } - - /** - * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. - * @param codecClass Type of Tribble codec class to build. - * @param inputFile Input file type to use. 
- * @return An RMDTrack, suitable for accessing reference metadata. - */ - public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { - final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); - - if (descriptor == null) - throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName()); - - return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); - } - - /** - * create a feature reader, without assuming there exists an index. This code assumes the feature - * reader of the appropriate type will figure out what the right index type is, and determine if it - * exists. - * - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the track - * @param inputFile the file to load - * @return a feature reader implementation - */ - private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { - // we might not know the index type, try loading with the default reader constructor - logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); - try { - return new Pair(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)),null); - } catch (TribbleException e) { - throw new UserException(e.getMessage(), e); - } - } - - /** - * add a name to the codec, if it takes one - * @param descriptor the class to create a codec for - * @param name the name to assign this codec - * @return the feature codec itself - */ - private FeatureCodec createCodec(FeatureManager.FeatureDescriptor descriptor, String name) { - return featureManager.createCodec(descriptor, name, genomeLocParser); - } - - /** - * create a feature source object given: - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param 
name the name of the codec - * @param inputFile the tribble file to parse - * @param storageType How the RMD is streamed into the input file. - * @return the input file as a FeatureReader - */ - private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, - String name, - File inputFile, - RMDStorageType storageType) { - // Feature source and sequence dictionary to use as the ultimate reference - AbstractFeatureReader featureSource = null; - SAMSequenceDictionary sequenceDictionary = null; - - // Detect whether or not this source should be indexed. - boolean canBeIndexed = (storageType == RMDStorageType.FILE); - - if(canBeIndexed) { - try { - Index index = loadIndex(inputFile, createCodec(descriptor, name)); - try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } - catch (ReviewedStingException e) { } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - - // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { - validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); - - if ( ! 
disableAutoIndexCreation ) { - File indexFile = Tribble.indexFile(inputFile); - try { // re-write the index - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); - } - } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - } - - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), index); - } - catch (TribbleException e) { - throw new UserException(e.getMessage()); - } - catch (IOException e) { - throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); - } - } - else { - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name), false); - } - - return new Pair(featureSource,sequenceDictionary); - } - - /** - * create an index for the input file - * @param inputFile the input file - * @param codec the codec to use - * @return a linear index for the specified type - * @throws IOException if we cannot write the index file - */ - public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { - final File indexFile = Tribble.indexFile(inputFile); - final FSLockWithShared lock = new FSLockWithShared(indexFile); - Index idx = null; - - // If the index file exists and is readable, attempt to load it from disk. We'll get null back - // if a problem was discovered with the index file when it was inspected, and we'll get an - // in-memory index back in the case where the index file could not be locked. - if (indexFile.canRead()) { - idx = disableAutoIndexCreation ? 
loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode - : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); - } - - // If we have an index, it means we either loaded it from disk without issue or we created an in-memory - // index due to not being able to acquire a lock. - if (idx != null) return idx; - - // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index - idx = createIndexInMemory(inputFile, codec); - if ( ! disableAutoIndexCreation ) { - writeIndexToDisk(idx, indexFile, lock); - } - return idx; - } - - /** - * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if - * a lock could not be obtained. Returns null if a problem was discovered with the index file when it - * was examined (eg., it was out-of-date). - * - * @param inputFile the input file - * @param codec the codec to read from - * @param indexFile the index file itself - * @param lock the lock file - * @return an index, or null if we couldn't load one - * @throws IOException if we fail for FS issues - */ - protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { - boolean locked = false; - Index idx = null; - - try { - locked = lock.sharedLock(); - - if ( ! 
locked ) { // can't lock file - logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", - indexFile.getAbsolutePath())); - idx = createIndexInMemory(inputFile, codec); - } - else { - idx = loadFromDisk(inputFile, indexFile); - } - } finally { - if (locked) lock.unlock(); - } - return idx; - } - - /** - * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) - * @param inputFile the input file - * @param indexFile the input file, plus the index extension - * @return an Index, or null if we're unable to load - */ - protected Index loadFromDisk( final File inputFile, final File indexFile ) { - logger.info("Loading Tribble index from disk for file " + inputFile); - Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - // check if the file is up-to date (filestamp and version check) - if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) - return index; - else if (indexFile.lastModified() < inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - - if ( ! 
disableAutoIndexCreation ) { - boolean deleted = indexFile.delete(); - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); - } - - return null; - } - - - /** - * attempt to write the index to disk - * @param index the index to write to disk - * @param indexFile the index file location - * @param lock the locking object - * @throws IOException when unable to create the new index - */ - private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { - if ( disableAutoIndexCreation ) { - return; - } - - boolean locked = false; - - try { - locked = lock.exclusiveLock(); - - if (locked) { - logger.info("Writing Tribble index to disk for file " + indexFile); - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - } - else // we can't write it to disk, just store it in memory, tell them this - logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); - - try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } - catch ( ReviewedStingException e) { } - } - finally { - if (locked) lock.unlock(); - } - - } - - /** - * create the index in memory, given the input file and feature codec - * @param inputFile the input file - * @param codec the codec - * @return a LinearIndex, given the file location - * @throws IOException when unable to create the index in memory - */ - protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { - // this can take a while, let them know what we're doing - logger.info("Creating Tribble index in memory for file " + inputFile); - Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); - return idx; - } - - /** - * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. - * (that each contig in the index is in the sequence dictionary). - * @param inputFile for proper error message formatting. - * @param dict the sequence dictionary - * @param index the index file - */ - public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { - if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); - - // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - - // actually update the dictionary in the index - IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); - } - - public void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict ) { - IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java deleted file mode 100644 index ca3255097..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ /dev/null @@ -1,1094 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, 
including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.coverage; - -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import 
org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library - * - *

- * Coverage processes a set of bam files to determine coverage at different levels of partitioning and - * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by - * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, - * and/or percentage of bases covered to or beyond a threshold. - * Additionally, reads and bases can be filtered by mapping or base quality score. - * - *

Input

- *

- * One or more bam files (with proper headers) to be analyzed for coverage statistics - *

- *

- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *

- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) - *

- *

Output

- *

- * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: - *

- * - no suffix: per locus coverage - *

- * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases - *

- * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases - *

- * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval - *

- * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples - *

- * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene - *

- * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples - *

- * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases - *

- * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T DepthOfCoverage \
- *   -o file_name_base \
- *   -I input_bams.list
- *   [-geneList refSeq.sorted.txt] \
- *   [-pt readgroup] \
- *   [-ct 4 -ct 6 -ct 10] \
- *   [-L my_capture_genes.interval_list]
- * 
- * - */ -// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time -// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) -// todo -- alter logarithmic scaling to spread out bins more -// todo -- allow for user to set linear binning (default is logarithmic) -// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@By(DataSource.REFERENCE) -@PartitionBy(PartitionType.NONE) -@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE) -public class DepthOfCoverage extends LocusWalker>, CoveragePartitioner> implements TreeReducible { - @Output - @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) - Map out; - - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false) - int minMappingQuality = -1; - @Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false) - int maxMappingQuality = Integer.MAX_VALUE; - - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false) - byte minBaseQuality = -1; - @Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. 
Defaults to 127 (Byte.MAX_VALUE).", required = false) - byte maxBaseQuality = Byte.MAX_VALUE; - - @Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false) - CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS; - - /** - * Instead of reporting depth, report the base pileup at each locus - */ - @Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false) - boolean printBaseCounts = false; - - /** - * Do not tabulate locus statistics (# loci covered by sample by coverage) - */ - @Argument(fullName = "omitLocusTable", shortName = "omitLocusTable", doc = "Will not calculate the per-sample per-depth counts of loci, which should result in speedup", required = false) - boolean omitLocusTable = false; - - /** - * Do not tabulate interval statistics (mean, median, quartiles AND # intervals by sample by coverage) - */ - @Argument(fullName = "omitIntervalStatistics", shortName = "omitIntervals", doc = "Will omit the per-interval statistics section, which should result in speedup", required = false) - boolean omitIntervals = false; - /** - * Do not print the total coverage at every base - */ - @Argument(fullName = "omitDepthOutputAtEachBase", shortName = "omitBaseOutput", doc = "Will omit the output of the depth of coverage at each base, which should result in speedup", required = false) - boolean omitDepthOutput = false; - - /** - * Path to the RefSeq file for use in aggregating coverage statistics over genes - */ - @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false) - File refSeqGeneList = null; - - /** - * The format of the output file - */ - @Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. 
csv, table, rtable); defaults to r-readable table", required = false) - String outputFormat = "rtable"; - - - // --------------------------------------------------------------------------- - // - // Advanced arguments - // - // --------------------------------------------------------------------------- - @Advanced - @Argument(fullName = "includeRefNSites", doc = "If provided, sites with reference N bases but with coverage from neighboring reads will be included in DoC calculations.", required = false) - boolean includeRefNBases = false; - - @Advanced - @Argument(fullName = "printBinEndpointsAndExit", doc = "Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data.", required = false) - boolean printBinEndpointsAndExit = false; - - /** - * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin. - */ - @Advanced - @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) - int start = 1; - /** - * Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin. - */ - @Advanced - @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) - int stop = 500; - /** - * Sets the number of bins for granular binning - */ - @Advanced - @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false) - int nBins = 499; - - /** - * Do not tabulate the sample summary statistics (total, mean, median, quartile coverage per sample) - */ - @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) - boolean omitSampleSummary = false; - /** - * A way of partitioning reads into groups. Can be sample, readgroup, or library. 
- */ - @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false) - Set partitionTypes = EnumSet.of(DoCOutputType.Partition.sample); - - /** - * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. - */ - @Advanced - @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) - boolean includeDeletions = false; - - @Advanced - @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) - boolean ignoreDeletionSites = false; - - /** - * A coverage threshold for summarizing (e.g. % bases >= CT for each sample) - */ - @Advanced - @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false) - int[] coverageThresholds = {15}; - - String[] OUTPUT_FORMATS = {"table","rtable","csv"}; - String separator = "\t"; - Map> orderCheck = new HashMap>(); - - //////////////////////////////////////////////////////////////////////////////////// - // STANDARD WALKER METHODS - //////////////////////////////////////////////////////////////////////////////////// - - public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } - - public void initialize() { - - if ( printBinEndpointsAndExit ) { - int[] endpoints = DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins); - System.out.print("[ "); - for ( int e : endpoints ) { - System.out.print(e+" "); - } - System.out.println("]"); - System.exit(0); - } - - // Check the output format - boolean goodOutputFormat = false; - for ( String f : OUTPUT_FORMATS ) { - goodOutputFormat = goodOutputFormat || f.equals(outputFormat); - } - - if ( ! 
goodOutputFormat ) { - throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat); - } - - if ( outputFormat.equals("csv") ) { - separator = ","; - } - - if ( ! omitDepthOutput ) { // print header - PrintStream out = getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary); - out.printf("%s\t%s","Locus","Total_Depth"); - for (DoCOutputType.Partition type : partitionTypes ) { - out.printf("\t%s_%s","Average_Depth",type.toString()); - } - - // get all the samples - HashSet allSamples = getSamplesFromToolKit(partitionTypes); - ArrayList allSampleList = new ArrayList(allSamples.size()); - for ( String s : allSamples ) { - allSampleList.add(s); - } - Collections.sort(allSampleList); - - for ( String s : allSampleList) { - out.printf("\t%s_%s","Depth_for",s); - if ( printBaseCounts ) { - out.printf("\t%s_%s",s,"base_counts"); - } - } - - out.printf("%n"); - - } else { - logger.info("Per-Locus Depth of Coverage output was omitted"); - } - - for (DoCOutputType.Partition type : partitionTypes ) { - orderCheck.put(type,new ArrayList()); - for ( String id : getSamplesFromToolKit(type) ) { - orderCheck.get(type).add(id); - } - Collections.sort(orderCheck.get(type)); - } - } - - private HashSet getSamplesFromToolKit( Collection types ) { - HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order - for (DoCOutputType.Partition t : types ) { - partitions.addAll(getSamplesFromToolKit(t)); - } - - return partitions; - } - - private HashSet getSamplesFromToolKit(DoCOutputType.Partition type) { - HashSet partition = new HashSet(); - if ( type == DoCOutputType.Partition.sample ) { - partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); - } else if ( type == DoCOutputType.Partition.readgroup ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); - } - } 
else if ( type == DoCOutputType.Partition.library ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getLibrary()); - } - } else if ( type == DoCOutputType.Partition.center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getSequencingCenter()); - } - } else if ( type == DoCOutputType.Partition.platform ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(rg.getPlatform()); - } - } else if ( type == DoCOutputType.Partition.sample_by_center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_cn_%s",rg.getSample(),rg.getSequencingCenter())); - } - } else if ( type == DoCOutputType.Partition.sample_by_platform ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_pl_%s",rg.getSample(),rg.getPlatform())); - } - } else if ( type == DoCOutputType.Partition.sample_by_platform_by_center ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partition.add(String.format("%s_pl_%s_cn_%s",rg.getSample(),rg.getPlatform(),rg.getSequencingCenter())); - } - } else { - throw new ReviewedStingException("Invalid aggregation type sent to getSamplesFromToolKit"); - } - - return partition; - } - - public boolean isReduceByInterval() { - return ( ! 
omitIntervals ); - } - - public CoveragePartitioner reduceInit() { - CoveragePartitioner aggro = new CoveragePartitioner(partitionTypes,start,stop,nBins); - for (DoCOutputType.Partition t : partitionTypes ) { - aggro.addIdentifiers(t,getSamplesFromToolKit(t)); - } - aggro.initialize(includeDeletions,omitLocusTable); - checkOrder(aggro); - return aggro; - } - - public Map> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (includeRefNBases || BaseUtils.isRegularBase(ref.getBase())) { - if ( ! omitDepthOutput ) { - getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary).printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) - //System.out.printf("\t[log]\t%s",ref.getLocus()); - } - - return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes); - } else { - return null; - } - } - - public CoveragePartitioner reduce(Map> thisMap, CoveragePartitioner prevReduce) { - if ( thisMap != null ) { // skip sites we didn't want to include in the calculation (ref Ns) - if ( ! 
omitDepthOutput ) { - //checkOrder(prevReduce); // tests prevReduce.getIdentifiersByType().get(t) against the initialized header order - printDepths(getCorrectStream(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary),thisMap,prevReduce.getIdentifiersByType()); - // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without - // turning on omit - } - - prevReduce.update(thisMap); // note that in "useBoth" cases, this method alters the thisMap object - } - - return prevReduce; - } - - public CoveragePartitioner treeReduce(CoveragePartitioner left, CoveragePartitioner right) { - left.merge(right); - return left; - } - - //////////////////////////////////////////////////////////////////////////////////// - // INTERVAL ON TRAVERSAL DONE - //////////////////////////////////////////////////////////////////////////////////// - - public void onTraversalDone( List> statsByInterval ) { - if ( refSeqGeneList != null && partitionTypes.contains(DoCOutputType.Partition.sample) ) { - printGeneStats(statsByInterval); - } - - if ( statsByInterval.size() > 0 ) { - for(DoCOutputType.Partition partition: partitionTypes) { - if ( checkType(statsByInterval.get(0).getSecond().getCoverageByAggregationType(partition) ,partition) ) { - printIntervalStats(statsByInterval, - getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary), - getCorrectStream(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics), - partition); - } else { - throw new ReviewedStingException("Partition type "+partition.toString()+" had no entries. Please check that your .bam header has all appropriate partition types."); - } - } - } else { - throw new UserException.CommandLineException("Cannot reduce by interval without interval list provided. 
Please provide a -L argument."); - } - - onTraversalDone(mergeAll(statsByInterval)); - - } - - public CoveragePartitioner mergeAll(List> stats) { - CoveragePartitioner first = stats.remove(0).second; - for ( Pair iStat : stats ) { - treeReduce(first,iStat.second); - } - - return first; - } - - private DepthOfCoverageStats printIntervalStats(List> statsByInterval, PrintStream summaryOut, PrintStream statsOut, DoCOutputType.Partition type) { - Pair firstPair = statsByInterval.get(0); - CoveragePartitioner firstAggregator = firstPair.second; - DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type); - - StringBuilder summaryHeader = new StringBuilder(); - summaryHeader.append("Target"); - summaryHeader.append(separator); - summaryHeader.append("total_coverage"); - summaryHeader.append(separator); - summaryHeader.append("average_coverage"); - - for ( String s : firstStats.getAllSamples() ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_total_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_mean_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q1"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_median"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q3"); - for ( int thresh : coverageThresholds ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_%_above_"); - summaryHeader.append(thresh); - } - } - - summaryOut.printf("%s%n",summaryHeader); - - int[][] nTargetsByAvgCvgBySample = new int[firstStats.getHistograms().size()][firstStats.getEndpoints().length+1]; - - for ( Pair targetAggregator : statsByInterval ) { - - Pair targetStats = new Pair( - targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type)); - 
printTargetSummary(summaryOut,targetStats); - updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); - } - - printIntervalTable(statsOut,nTargetsByAvgCvgBySample,firstStats.getEndpoints()); - - return firstStats; - } - - private void printGeneStats(List> statsByTarget) { - logger.debug("statsByTarget size is "+Integer.toString(statsByTarget.size())); - logger.debug("Initializing refseq..."); - LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq(); - logger.debug("Refseq init done."); - List> statsByGene = new ArrayList>();// maintains order - Map geneNamesToStats = new HashMap(); // allows indirect updating of objects in list - - for ( Pair targetStats : statsByTarget ) { - String gene = getGeneName(targetStats.first,refseqIterator); - if ( geneNamesToStats.keySet().contains(gene) ) { - logger.debug("Merging "+geneNamesToStats.get(gene).toString()+" and "+targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample).toString()); - geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); - } else { - DepthOfCoverageStats merger = new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(DoCOutputType.Partition.sample)); - geneNamesToStats.put(gene,merger); - statsByGene.add(new Pair(gene,merger)); - } - } - - PrintStream geneSummaryOut = getCorrectStream(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); - StringBuilder summaryHeader = new StringBuilder(); - summaryHeader.append("Gene"); - summaryHeader.append(separator); - summaryHeader.append("total_coverage"); - summaryHeader.append(separator); - summaryHeader.append("average_coverage"); - - for ( String s : statsByTarget.get(0).second.getCoverageByAggregationType(DoCOutputType.Partition.sample).getAllSamples() ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_total_cvg"); - summaryHeader.append(separator); - 
summaryHeader.append(s); - summaryHeader.append("_mean_cvg"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q1"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_median"); - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_granular_Q3"); - for ( int thresh : coverageThresholds ) { - summaryHeader.append(separator); - summaryHeader.append(s); - summaryHeader.append("_%_above_"); - summaryHeader.append(thresh); - } - } - - geneSummaryOut.printf("%s%n",summaryHeader); - - for ( Pair geneStats : statsByGene ) { - printTargetSummary(geneSummaryOut,geneStats); - } - } - - //blatantly stolen from Andrew Kernytsky - private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) { - logger.debug("Examining "+target.toString()); - if (refseqIterator == null) { return "UNKNOWN"; } - - RODRecordList annotationList = refseqIterator.seekForward(target); - logger.debug("Annotation list is " + (annotationList == null ? 
"null" : annotationList.getName())); - if (annotationList == null) { return "UNKNOWN"; } - - for(GATKFeature rec : annotationList) { - if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) { - logger.debug("We do overlap "+ rec.getUnderlyingObject().toString()); - return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName(); - } - logger.debug("No overlap"); - } - - return "UNKNOWN"; - - } - - private LocationAwareSeekableRODIterator initializeRefSeq() { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe, - getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,refSeqGeneList); - return new SeekableRODIterator(refseq.getHeader(),refseq.getSequenceDictionary(),getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(),refseq.getIterator()); - } - - private void printTargetSummary(PrintStream output, Pair intervalStats) { - DepthOfCoverageStats stats = intervalStats.second; - int[] bins = stats.getEndpoints(); - - StringBuilder targetSummary = new StringBuilder(); - targetSummary.append(intervalStats.first.toString()); - targetSummary.append(separator); - targetSummary.append(stats.getTotalCoverage()); - targetSummary.append(separator); - targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage())); - - for ( String s : stats.getAllSamples() ) { - targetSummary.append(separator); - targetSummary.append(stats.getTotals().get(s)); - targetSummary.append(separator); - targetSummary.append(String.format("%.2f", stats.getMeans().get(s))); - targetSummary.append(separator); - int median = getQuantile(stats.getHistograms().get(s),0.5); - int q1 = getQuantile(stats.getHistograms().get(s),0.25); - int q3 = 
getQuantile(stats.getHistograms().get(s),0.75); - targetSummary.append(formatBin(bins,q1)); - targetSummary.append(separator); - targetSummary.append(formatBin(bins,median)); - targetSummary.append(separator); - targetSummary.append(formatBin(bins,q3)); - for ( int thresh : coverageThresholds ) { - targetSummary.append(String.format("%s%.1f",separator,getPctBasesAbove(stats.getHistograms().get(s),stats.value2bin(thresh)))); - } - - } - - output.printf("%s%n", targetSummary); - } - - private String formatBin(int[] bins, int quartile) { - if ( quartile >= bins.length ) { - return String.format(">%d",bins[bins.length-1]); - } else if ( quartile < 0 ) { - return String.format("<%d",bins[0]); - } else { - return String.format("%d",bins[quartile]); - } - } - - private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) { - String colHeader = outputFormat.equals("rtable") ? "" : "Number_of_sources"; - output.printf(colHeader + separator+"depth>=%d",0); - for ( int col = 0; col < intervalTable[0].length-1; col ++ ) { - output.printf(separator+"depth>=%d",cutoffs[col]); - } - - output.printf(String.format("%n")); - for ( int row = 0; row < intervalTable.length; row ++ ) { - output.printf("At_least_%d_samples",row+1); - for ( int col = 0; col < intervalTable[0].length; col++ ) { - output.printf(separator+"%d",intervalTable[row][col]); - } - output.printf(String.format("%n")); - } - } - - /* - * @updateTargetTable - * The idea is to have counts for how many *targets* have at least K samples with - * median coverage of at least X. - * To that end: - * Iterate over the samples the DOCS object, determine how many there are with - * median coverage > leftEnds[0]; how many with median coverage > leftEnds[1] - * and so on. Then this target has at least N, N-1, N-2, ... 1, 0 samples covered - * to leftEnds[0] and at least M,M-1,M-2,...1,0 samples covered to leftEnds[1] - * and so on. 
- */ - private void updateTargetTable(int[][] table, DepthOfCoverageStats stats) { - int[] cutoffs = stats.getEndpoints(); - int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything - for ( int i = 0; i < countsOfMediansAboveCutoffs.length; i ++) { - countsOfMediansAboveCutoffs[i]=0; - } - - for ( String s : stats.getAllSamples() ) { - int medianBin = getQuantile(stats.getHistograms().get(s),0.5); - for ( int i = 0; i <= medianBin; i ++) { - countsOfMediansAboveCutoffs[i]++; - } - } - - for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) { - for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) { - table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++; - // the -1 is due to counts being 1-based and offsets being 0-based - } - } - } - - //////////////////////////////////////////////////////////////////////////////////// - // FINAL ON TRAVERSAL DONE - //////////////////////////////////////////////////////////////////////////////////// - - public void onTraversalDone(CoveragePartitioner coverageProfiles) { - /////////////////// - // OPTIONAL OUTPUTS - ////////////////// - - if ( ! omitSampleSummary ) { - logger.info("Printing summary info"); - for (DoCOutputType.Partition type : partitionTypes ) { - outputSummaryFiles(coverageProfiles,type); - } - } - - if ( ! 
omitLocusTable ) { - logger.info("Printing locus summary"); - for (DoCOutputType.Partition type : partitionTypes ) { - outputLocusFiles(coverageProfiles,type); - } - } - } - - private void outputLocusFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { - printPerLocus(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts), - getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions), - coverageProfiles.getCoverageByAggregationType(type),type); - } - - private void outputSummaryFiles(CoveragePartitioner coverageProfiles, DoCOutputType.Partition type ) { - printPerSample(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics),coverageProfiles.getCoverageByAggregationType(type)); - printSummary(getCorrectStream(type, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary),coverageProfiles.getCoverageByAggregationType(type)); - } - - //////////////////////////////////////////////////////////////////////////////////// - // HELPER OUTPUT METHODS - //////////////////////////////////////////////////////////////////////////////////// - - private void printPerSample(PrintStream output,DepthOfCoverageStats stats) { - int[] leftEnds = stats.getEndpoints(); - - StringBuilder hBuilder = new StringBuilder(); - if ( ! 
outputFormat.equals("rTable")) { - hBuilder.append("Source_of_reads"); - } - hBuilder.append(separator); - hBuilder.append(String.format("from_0_to_%d)%s",leftEnds[0],separator)); - for ( int i = 1; i < leftEnds.length; i++ ) - hBuilder.append(String.format("from_%d_to_%d)%s",leftEnds[i-1],leftEnds[i],separator)); - hBuilder.append(String.format("from_%d_to_inf%n",leftEnds[leftEnds.length-1])); - output.print(hBuilder.toString()); - Map histograms = stats.getHistograms(); - - for ( Map.Entry p : histograms.entrySet() ) { - StringBuilder sBuilder = new StringBuilder(); - sBuilder.append(String.format("sample_%s",p.getKey())); - for ( long count : p.getValue() ) { - sBuilder.append(String.format("%s%d",separator,count)); - } - sBuilder.append(String.format("%n")); - output.print(sBuilder.toString()); - } - } - - private void printPerLocus(PrintStream output, PrintStream coverageOut, DepthOfCoverageStats stats, DoCOutputType.Partition partitionType) { - int[] endpoints = stats.getEndpoints(); - int samples = stats.getHistograms().size(); - - long[][] baseCoverageCumDist = stats.getLocusCounts(); - - // rows - # of samples - // columns - depth of coverage - - boolean printSampleColumnHeader = outputFormat.equals("csv") || outputFormat.equals("table"); - - StringBuilder header = new StringBuilder(); - if ( printSampleColumnHeader ) { - // mhanna 22 Aug 2010 - Deliberately force this header replacement to make sure integration tests pass. - // TODO: Update integration tests and get rid of this. - header.append(partitionType == DoCOutputType.Partition.readgroup ? 
"read_group" : partitionType.toString()); - } - header.append(String.format("%sgte_0",separator)); - for ( int d : endpoints ) { - header.append(String.format("%sgte_%d",separator,d)); - } - header.append(String.format("%n")); - - output.print(header); - coverageOut.print(header); - - for ( int row = 0; row < samples; row ++ ) { - output.printf("%s_%d","NSamples",row+1); - for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) { - output.printf("%s%d",separator,baseCoverageCumDist[row][depthBin]); - } - output.printf("%n"); - } - - for ( String sample : stats.getAllSamples() ) { - coverageOut.printf("%s",sample); - double[] coverageDistribution = stats.getCoverageProportions(sample); - for ( int bin = 0; bin < coverageDistribution.length; bin ++ ) { - coverageOut.printf("%s%.2f",separator,coverageDistribution[bin]); - } - coverageOut.printf("%n"); - } - } - - private PrintStream getCorrectStream(DoCOutputType.Partition partition, DoCOutputType.Aggregation aggregation, DoCOutputType.FileType fileType) { - DoCOutputType outputType = new DoCOutputType(partition,aggregation,fileType); - if(!out.containsKey(outputType)) - throw new UserException.CommandLineException(String.format("Unable to find appropriate stream for partition = %s, aggregation = %s, file type = %s",partition,aggregation,fileType)); - return out.get(outputType); - } - - private void printSummary(PrintStream output, DepthOfCoverageStats stats) { - if ( ! 
outputFormat.equals("csv") ) { - output.printf("%s\t%s\t%s\t%s\t%s\t%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); - } else { - output.printf("%s,%s,%s,%s,%s,%s","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); - } - - for ( int thresh : coverageThresholds ) { - output.printf("%s%s%d",separator,"%_bases_above_",thresh); - } - - output.printf("%n"); - - Map histograms = stats.getHistograms(); - Map means = stats.getMeans(); - Map totals = stats.getTotals(); - int[] leftEnds = stats.getEndpoints(); - - for ( Map.Entry p : histograms.entrySet() ) { - String s = p.getKey(); - long[] histogram = p.getValue(); - int median = getQuantile(histogram,0.5); - int q1 = getQuantile(histogram,0.25); - int q3 = getQuantile(histogram,0.75); - // if any of these are larger than the higest bin, put the median as in the largest bin - median = median == histogram.length-1 ? histogram.length-2 : median; - q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; - q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; - if ( ! outputFormat.equals("csv") ) { - output.printf("%s\t%d\t%.2f\t%d\t%d\t%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); - } else { - output.printf("%s,%d,%.2f,%d,%d,%d",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); - } - - for ( int thresh : coverageThresholds ) { - output.printf("%s%.1f",separator,getPctBasesAbove(histogram,stats.value2bin(thresh))); - } - - output.printf("%n"); - } - - if ( ! 
outputFormat.equals("csv") ) { - output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); - } else { - output.printf("%s,%d,%.2f,%s,%s,%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); - } - } - - private int getQuantile(long[] histogram, double prop) { - int total = 0; - - for ( int i = 0; i < histogram.length; i ++ ) { - total += histogram[i]; - } - - int counts = 0; - int bin = -1; - while ( counts < prop*total ) { - counts += histogram[bin+1]; - bin++; - } - - return bin == -1 ? 0 : bin; - } - - private double getPctBasesAbove(long[] histogram, int bin) { - long below = 0l; - long above = 0l; - for ( int index = 0; index < histogram.length; index++) { - if ( index < bin ) { - below+=histogram[index]; - } else { - above+=histogram[index]; - } - } - - return 100*( (double) above )/( above + below ); - } - - private void printDepths(PrintStream stream, Map> countsBySampleByType, Map> identifiersByType) { - // get the depths per sample and build up the output string while tabulating total and average coverage - StringBuilder perSampleOutput = new StringBuilder(); - int tDepth = 0; - boolean depthCounted = false; - for (DoCOutputType.Partition type : partitionTypes ) { - Map countsByID = countsBySampleByType.get(type); - for ( String s : identifiersByType.get(type) ) { - perSampleOutput.append(separator); - long dp = (countsByID != null && countsByID.keySet().contains(s)) ? sumArray(countsByID.get(s)) : 0 ; - perSampleOutput.append(dp); - if ( printBaseCounts ) { - perSampleOutput.append(separator); - perSampleOutput.append(baseCounts(countsByID != null ? countsByID.get(s) : null )); - } - if ( ! 
depthCounted ) { - tDepth += dp; - } - } - depthCounted = true; // only sum the total depth once - } - - // remember -- genome locus was printed in map() - stream.printf("%s%d",separator,tDepth); - for (DoCOutputType.Partition type : partitionTypes ) { - stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) ); - } - stream.printf("%s%n",perSampleOutput); - } - - private long sumArray(int[] array) { - long i = 0; - for ( int j : array ) { - i += j; - } - return i; - } - - private String baseCounts(int[] counts) { - if ( counts == null ) { - counts = new int[6]; - } - StringBuilder s = new StringBuilder(); - int nbases = 0; - for ( byte b : BaseUtils.EXTENDED_BASES ) { - nbases++; - if ( includeDeletions || b != BaseUtils.Base.D.base ) { - s.append((char)b); - s.append(":"); - s.append(counts[BaseUtils.extendedBaseToBaseIndex(b)]); - if ( nbases < 6 ) { - s.append(" "); - } - } - } - - return s.toString(); - } - - private void checkOrder(CoveragePartitioner ag) { - // make sure the ordering stored at initialize() is propagated along reduce - for (DoCOutputType.Partition t : partitionTypes ) { - List order = orderCheck.get(t); - List namesInAg = ag.getIdentifiersByType().get(t); - - // todo -- chris check me - Set namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples(); - int index = 0; - for ( String s : namesInAg ) { - if ( ! s.equalsIgnoreCase(order.get(index)) ) { - throw new ReviewedStingException("IDs are out of order for type "+t+"! Aggregator has different ordering"); - } - index++; - } - } - } - - public boolean checkType(DepthOfCoverageStats stats, DoCOutputType.Partition type ) { - if ( stats.getHistograms().size() < 1 ) { - logger.warn("The histogram per partition type "+type.toString()+" was empty\n"+ - "Do your read groups have this type? 
(Check your .bam header)."); - return false; - } else { - return true; - } - } - -} - -class DoCOutputMultiplexer implements Multiplexer { - private final Set partitions; - private final File refSeqGeneList; - private final boolean omitDepthOutput; - private final boolean omitIntervals; - private final boolean omitSampleSummary; - private final boolean omitLocusTable; - - /** - * Create a new multiplexer type using the values of all variable fields. - * @param partitions - * @param refSeqGeneList - * @param omitDepthOutput - * @param omitIntervals - * @param omitSampleSummary - * @param omitLocusTable - */ - public DoCOutputMultiplexer(final Set partitions, - final File refSeqGeneList, - final boolean omitDepthOutput, - final boolean omitIntervals, - final boolean omitSampleSummary, - final boolean omitLocusTable) { - this.partitions = partitions; - this.refSeqGeneList = refSeqGeneList; - this.omitDepthOutput = omitDepthOutput; - this.omitIntervals = omitIntervals; - this.omitSampleSummary = omitSampleSummary; - this.omitLocusTable = omitLocusTable; - } - - public Collection multiplex() { - List outputs = new ArrayList(); - if(!omitDepthOutput) outputs.add(new DoCOutputType(null, DoCOutputType.Aggregation.locus, DoCOutputType.FileType.summary)); - - if(!omitIntervals) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.summary)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.interval, DoCOutputType.FileType.statistics)); - } - } - - if(refSeqGeneList != null && partitions.contains(DoCOutputType.Partition.sample)) { - DoCOutputType geneSummaryOut = new DoCOutputType(DoCOutputType.Partition.sample, DoCOutputType.Aggregation.gene, DoCOutputType.FileType.summary); - outputs.add(geneSummaryOut); - } - - if(!omitSampleSummary) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, 
DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.summary)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.statistics)); - } - } - - if(!omitLocusTable) { - for(DoCOutputType.Partition partition: partitions) { - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_counts)); - outputs.add(new DoCOutputType(partition, DoCOutputType.Aggregation.cumulative, DoCOutputType.FileType.coverage_proportions)); - } - } - - return outputs; - } - - public String transformArgument(final DoCOutputType outputType, final String argument) { - return outputType.getFileName(argument); - } - -} - -class CoveragePartitioner { - private Collection types; - private Map coverageProfiles; - private Map> identifiersByType; - private Set allIdentifiers; - public CoveragePartitioner(Collection typesToUse, int start, int stop, int nBins) { - coverageProfiles = new HashMap(); - identifiersByType = new HashMap>(); - types = typesToUse; - for ( DoCOutputType.Partition type : types ) { - coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins))); - identifiersByType.put(type,new ArrayList()); - } - allIdentifiers = new HashSet(); - } - - public void merge(CoveragePartitioner otherAggregator) { - for ( DoCOutputType.Partition type : types ) { - this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type)); - } - } - - public DepthOfCoverageStats getCoverageByAggregationType(DoCOutputType.Partition t) { - return coverageProfiles.get(t); - } - - public void addIdentifiers(DoCOutputType.Partition t, Set ids) { - for ( String s : ids ) { - coverageProfiles.get(t).addSample(s); - identifiersByType.get(t).add(s); - allIdentifiers.add(s); - } - Collections.sort(identifiersByType.get(t)); - } - - public void initialize(boolean useDels, boolean omitLocusTable) { - for ( DoCOutputType.Partition t : types ) { - 
if ( useDels ) { - coverageProfiles.get(t).initializeDeletions(); - } - if ( ! omitLocusTable ) { - coverageProfiles.get(t).initializeLocusCounts(); - } - } - } - - public void update(Map> countsByIdentifierByType) { - for ( DoCOutputType.Partition t : types ) { - coverageProfiles.get(t).update(countsByIdentifierByType.get(t)); - } - } - - public Set getAllIdentifiers() { - return allIdentifiers; - } - - public Map> getIdentifiersByType() { - return identifiersByType; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java deleted file mode 100644 index 533c7be73..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java +++ /dev/null @@ -1,169 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - * At every locus in the input set, compares the pileup data (reference base, aligned base from - * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data - * should be specified using the command-line argument '-pileup:SAMPileup '. 
- */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@Requires(value={DataSource.READS,DataSource.REFERENCE}) -public class CheckPileup extends LocusWalker implements TreeReducible { - @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true) - RodBinding pileup; - - @Output - private PrintStream out; - - @Argument(fullName="continue_after_error",doc="Continue after an error",required=false) - public boolean CONTINUE_AFTER_AN_ERROR = false; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - ReadBackedPileup pileup = context.getBasePileup(); - SAMPileupFeature truePileup = getTruePileup( tracker ); - - if ( truePileup == null ) { - out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar())); - if ( ! CONTINUE_AFTER_AN_ERROR ) { - throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", - context.getLocation(), new String(pileup.getBases()))); - } - } else { - String pileupDiff = pileupDiff(pileup, truePileup, true); - if ( pileupDiff != null ) { - out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString()); - if ( ! 
CONTINUE_AFTER_AN_ERROR ) { - throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff)); - } - } - } - - return pileup.getNumberOfElements(); - } - - private static String maybeSorted( final String x, boolean sortMe ) - { - if ( sortMe ) { - byte[] bytes = x.getBytes(); - Arrays.sort(bytes); - return new String(bytes); - } - else - return x; - } - - public String pileupDiff(final ReadBackedPileup a, final SAMPileupFeature b, boolean orderDependent) - { - if ( a.getNumberOfElements() != b.size() ) - return "Sizes not equal"; - GenomeLoc featureLocation = getToolkit().getGenomeLocParser().createGenomeLoc(b.getChr(),b.getStart(),b.getEnd()); - if ( a.getLocation().compareTo(featureLocation) != 0 ) - return "Locations not equal"; - - String aBases = maybeSorted(new String(a.getBases()), ! orderDependent ); - String bBases = maybeSorted(b.getBasesAsString(), ! orderDependent ); - if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) ) - return "Bases not equal"; - - String aQuals = maybeSorted(new String(a.getQuals()), ! orderDependent ); - String bQuals = maybeSorted(new String(b.getQuals()), ! orderDependent ); - if ( ! aQuals.equals(bQuals) ) - return "Quals not equal"; - - return null; - } - - // Given result of map function - public CheckPileupStats reduceInit() { return new CheckPileupStats(); } - public CheckPileupStats reduce(Integer value, CheckPileupStats sum) { - sum.nLoci++; - sum.nBases += value; - return sum; - } - - public CheckPileupStats treeReduce( CheckPileupStats lhs, CheckPileupStats rhs ) { - CheckPileupStats combined = new CheckPileupStats(); - combined.nLoci = lhs.nLoci + rhs.nLoci; - combined.nBases = lhs.nBases + rhs.nBases; - return combined; - } - - /** - * Extracts the true pileup data from the given rodSAMPileup. Note that this implementation - * assumes that the genotype will only be point or indel. - * @param tracker ROD tracker from which to extract pileup data. - * @return True pileup data. 
- */ - private SAMPileupFeature getTruePileup( RefMetaDataTracker tracker ) { - SAMPileupFeature pileupArg = tracker.getFirstValue(pileup); - - if( pileupArg == null) - return null; - - if( pileupArg.hasPointGenotype() ) - return pileupArg.getPointGenotype(); - else if( pileupArg.hasIndelGenotype() ) - return pileupArg.getIndelGenotype(); - else - throw new ReviewedStingException("Unsupported pileup type: " + pileupArg); - } -} - -class CheckPileupStats { - public long nLoci = 0; - public long nBases = 0; - - public CheckPileupStats() { - } - - public String toString() { - return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java deleted file mode 100644 index 7ec93e582..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ /dev/null @@ -1,112 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; - -/** - * a walker that simply throws errors. 
Allows us to test that the engine is behaving as expected with error handling - */ -@Hidden -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_TEST, extraDocs = {CommandLineGATK.class} ) -public class ErrorThrowing extends RefWalker implements TreeReducible, NanoSchedulable { - @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) - public String exceptionToThrow; - - @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) - public FailMethod failMethod = FailMethod.MAP; - - public enum FailMethod { - MAP, - REDUCE, - TREE_REDUCE - } - - // - // Template code to allow us to build the walker, doesn't actually do anything - // - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref == null ) // only throw exception when we are in proper map, not special map(null) call - return null; - - if ( failMethod == FailMethod.MAP ) - fail(); - - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value != null && failMethod == FailMethod.REDUCE ) - fail(); - return sum; - } - - public Integer treeReduce(final Integer lhs, final Integer rhs) { - if ( failMethod == FailMethod.TREE_REDUCE ) - fail(); - return rhs; - } - - private void fail() { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else if ( exceptionToThrow.equals("SamError1") ) { - throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); - } else if ( exceptionToThrow.equals("SamError2") ) { - throw new 
RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); - } else if ( exceptionToThrow.equals("NoSpace1") ) { - throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else if ( exceptionToThrow.equals("NoSpace2") ) { - throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java deleted file mode 100644 index 23bbf1460..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java +++ /dev/null @@ -1,189 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Emulates the samtools pileup command to print aligned reads - * - *

Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. - * - * Emulated command: - * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] - * - *

Input

- *

- * A BAM file and the interval to print. - *

- * - *

Output

- *

- * Formatted pileup-style alignment of reads. - *

- * - *

Example

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -T Pileup \
- *   -R ref.fasta \
- *   -I aligned_reads.bam \
- *   -o output.txt
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { - - private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names - - @Output - PrintStream out; - - /** - * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, - * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per - * read items are delimited with an '@' character. - */ - @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) - public boolean SHOW_VERBOSE = false; - - @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) - public List> rods = Collections.emptyList(); - - @Hidden - @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false) - public boolean outputInsertLength=false; - - @Override - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String rods = getReferenceOrderedData( tracker ); - - ReadBackedPileup basePileup = context.getBasePileup(); - - final StringBuilder s = new StringBuilder(); - s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); - if ( outputInsertLength ) - s.append(" ").append(insertLengthOutput(basePileup)); - if ( SHOW_VERBOSE ) - s.append(" ").append(createVerboseOutput(basePileup)); - s.append("\n"); - - return s.toString(); - } - - // Given result of map function - @Override - public Integer reduceInit() { return 0; } - - @Override - public Integer reduce(String value, 
Integer sum) { - out.print(value); - return sum + 1; - } - - @Override - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - /** - * Get a string representation the reference-ordered data. - * @param tracker Container for the reference-ordered data. - * @return String representation of the reference-ordered data. - */ - private String getReferenceOrderedData( RefMetaDataTracker tracker ) { - ArrayList rodStrings = new ArrayList(); - for ( Feature datum : tracker.getValues(rods) ) { - rodStrings.add(datum.toString()); - } - String rodString = Utils.join(", ", rodStrings); - - if ( !rodString.equals("") ) - rodString = "[ROD: " + rodString + "]"; - - return rodString; - } - private static String insertLengthOutput(final ReadBackedPileup pileup) { - - Integer[] insertSizes=new Integer[pileup.depthOfCoverage()]; - - int i=0; - for ( PileupElement p : pileup ) { - insertSizes[i]=p.getRead().getInferredInsertSize(); - ++i; - } - return Utils.join(",",insertSizes); - } - - - private static String createVerboseOutput(final ReadBackedPileup pileup) { - final StringBuilder sb = new StringBuilder(); - boolean isFirst = true; - - sb.append(pileup.getNumberOfDeletions()); - sb.append(" "); - - for ( PileupElement p : pileup ) { - if ( isFirst ) - isFirst = false; - else - sb.append(","); - sb.append(p.getRead().getReadName()); - sb.append(verboseDelimiter); - sb.append(p.getOffset()); - sb.append(verboseDelimiter); - sb.append(p.getRead().getReadLength()); - sb.append(verboseDelimiter); - sb.append(p.getRead().getMappingQuality()); - } - return sb.toString(); - } - - @Override - public void onTraversalDone(Integer result) { - out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java deleted file mode 100644 index e13252d49..000000000 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ /dev/null @@ -1,352 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; -import org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; - -import java.util.*; - -/** - * Combines VCF records from different sources. - * - *

- * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data - * and any number of sources can be input. This tool currently supports two different combination types for each of - * variants (the first 8 fields of the VCF) and genotypes (the rest). - * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. - * Union: assumes each rod represents the same set of samples (although this is not enforced); using the - * priority list (if provided), it emits a single record instance at every position represented in the rods. - * - * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD - * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, - * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs - * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just - * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single - * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out - * in the detailed example in the documentation guide. - * - * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful - * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time - * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together - * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing - * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. - * - * Some fine details about the merging algorithm: - *

    - *
  • As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of - * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the - * max QUAL, which resulted in sometime strange downstream confusion
  • - *
- * - *

Input

- *

- * One or more variant sets to combine. - *

- * - *

Output

- *

- * A combined VCF. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CombineVariants \
- *   --variant input1.vcf \
- *   --variant input2.vcf \
- *   -o output.vcf \
- *   -genotypeMergeOptions UNIQUIFY
- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CombineVariants \
- *   --variant:foo input1.vcf \
- *   --variant:bar input2.vcf \
- *   -o output.vcf \
- *   -genotypeMergeOptions PRIORITIZE
- *   -priority foo,bar
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker implements TreeReducible { - /** - * The VCF files to merge together - * - * variants can take any number of arguments on the command line. Each -V argument - * will be included in the final merged output VCF. If no explicit name is provided, - * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. - * The user can override this by providing an explicit name -V:name,vcf for each -V argument, - * and each named argument will be labeled as such in the output (i.e., set=name rather than - * set=variants2). The order of arguments does not matter unless except for the naming, so - * if you provide an rod priority list and no explicit names than variants, variants2, etc - * are technically order dependent. It is strongly recommended to provide explicit names when - * a rod priority list is provided. 
- */ - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public List> variants; - - @Output(doc="File to which variants should be written") - protected VariantContextWriter vcfWriter = null; - - @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null; - - @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) - public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; - - @Hidden - @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false) - public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE; - - /** - * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
- */ - @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) - public String PRIORITY_STRING = null; - - @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) - public boolean printComplexMerges = false; - - @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) - public boolean filteredAreUncalled = false; - - /** - * Used to generate a sites-only file. - */ - @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) - public boolean minimalVCF = false; - - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) - public boolean EXCLUDE_NON_VARIANTS = false; - - /** - * Set to 'null' if you don't want the set field emitted. - */ - @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) - public String SET_KEY = "set"; - - /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
- */ - @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) - public boolean ASSUME_IDENTICAL_SAMPLES = false; - - @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) - public int minimumN = 1; - - /** - * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. - */ - @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) - public boolean SUPPRESS_COMMAND_LINE_HEADER = false; - - @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) - public boolean MERGE_INFO_WITH_MAX_AC = false; - - private List priority = null; - - /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ - private boolean sitesOnlyVCF = false; - private Set samples; - - public void initialize() { - Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); - - if ( vcfWriter instanceof VariantContextWriterStub) { - sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES); - if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); - } else - logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); - - validateAnnotateUnionArguments(); - if ( PRIORITY_STRING == null && genotypeMergeOption == null) { - genotypeMergeOption = 
GATKVariantContextUtils.GenotypeMergeType.UNSORTED; - //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) - logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); - } - - if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && - !SampleUtils.verifyUniqueSamplesNames(vcfRods)) - throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); - - samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); - - if ( SET_KEY.toLowerCase().equals("null") ) - SET_KEY = null; - - Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - if ( SET_KEY != null ) - headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - if ( !ASSUME_IDENTICAL_SAMPLES ) - headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); - VCFHeader vcfHeader = new VCFHeader(headerLines, samples); - vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); - vcfWriter.writeHeader(vcfHeader); - } - - private void validateAnnotateUnionArguments() { - Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); - - if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) - throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - - if ( PRIORITY_STRING != null){ - priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); - if ( rodNames.size() != priority.size() ) - throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); - - if ( ! 
rodNames.containsAll(priority) ) - throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); - } - - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); - // get all of the vcf rods at this locus - // Need to provide reference bases to simpleMerge starting at current locus - Collection vcs = tracker.getValues(variants, context.getLocation()); - - if ( sitesOnlyVCF ) { - vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); - } - - if ( ASSUME_IDENTICAL_SAMPLES ) { - for ( final VariantContext vc : vcs ) { - vcfWriter.add(vc); - } - - return vcs.isEmpty() ? 0 : 1; - } - - int numFilteredRecords = 0; - for (final VariantContext vc : vcs) { - if (vc.filtersWereApplied() && vc.isFiltered()) - numFilteredRecords++; - } - - if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) - return 0; - - final List mergedVCs = new ArrayList<>(); - - if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { - final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); - - // TODO -- clean this up in a refactoring - // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) - if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { - final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); - for ( final VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) { - VCsByType.get(type).addAll(refs); - break; - } - } - } - - // iterate over the types so that it's deterministic - for (final VariantContext.Type type : VariantContext.Type.values()) { - // make sure that it is a variant or in case it is not, that we 
want to include the sites with no variants - if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { - if (VCsByType.containsKey(type)) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(), - filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); - } - } - } - } - else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, - genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); - } - else { - logger.warn("Ignoring all records at site " + ref.getLocus()); - } - - for ( final VariantContext mergedVC : mergedVCs ) { - // only operate at the start of events - if ( mergedVC == null ) - continue; - - final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); - // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(builder, false); - - if ( minimalVCF ) - GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - final VariantContext vc = builder.make(); - if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() ) - vcfWriter.add(builder.make()); - } - - return vcs.isEmpty() ? 
0 : 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - @Override - public Integer treeReduce(Integer lhs, Integer rhs) { - return reduce(lhs, rhs); - } - - public void onTraversalDone(Integer sum) {} -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java deleted file mode 100755 index 724578a09..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ /dev/null @@ -1,600 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFHeader; - -import java.io.PrintStream; -import java.util.*; - -/** - * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets - * - *

- * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, - * and for each sample, the genotype-by-genotype counts (for instance, the number of sites at which a sample was - * called homozygous reference in the EVAL callset, but homozygous variant in the COMP callset). It outputs these - * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in - * the COMP) and metrics (such as NRD and NRS). - * - *

Input

- *

- * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via - * the -eval and -comp arguments. - * - * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and - * -cfe arguments, respectively. - *

- * - *

Output

- * Genotype Concordance writes a GATK report to the specified file (via -o) , consisting of multiple tables of counts - * and proportions. These tables may be optionally moltenized via the -moltenize argument. That is, the standard table - * - *
- *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
- *  NA12878       0.003        0.001            0.000        (...)
- *  NA12891       0.005        0.000            0.000        (...)
- *  
- * - * would instead be displayed - * - *
- *  NA12878  NO_CALL_HOM_REF   0.003
- *  NA12878  NO_CALL_HET       0.001
- *  NA12878  NO_CALL_HOM_VAR   0.000
- *  NA12891  NO_CALL_HOM_REF   0.005
- *  NA12891  NO_CALL_HET       0.000
- *  NA12891  NO_CALL_HOM_VAR   0.000
- *  (...)
- *  
- * - * - * These tables are constructed on a per-sample basis, and include counts of eval vs comp genotype states, and the - * number of times the alternate alleles between the eval and comp sample did not match up. - * - * In addition, Genotype Concordance produces site-level allelic concordance. For strictly bi-allelic VCFs, - * only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated, but where multi-allelic sites are involved - * counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated. - * - * For example, in the following situation - *
- *    eval:  ref - A   alt - C
- *    comp:  ref - A   alt - C,T
- *  
- * then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH. - * However, in the case where eval has both C and T alternate alleles, both must be observed in the genotypes - * (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If - * one of the alleles has no observations in the genotype fields of the eval, the site-level concordance is - * tabulated as though that allele were not present in the record. - * - *

Monomorphic Records

- * A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been - * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the comp VCF), or not at all - * (if no record exists in the comp VCF). - * - * That is, in the situation - *
- *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
- *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
- *  
- * is equivalent to - *
- *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
- *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
- *  
- * - * When a record is present in the comp VCF the *genotypes* for the monomorphic site will still be used to evaluate - * per-sample genotype concordance counts. - * - *

Filtered Records

- * Filtered records are treated as though they were not present in the VCF, unless -ignoreSiteFilters is provided, - * in which case all records are used. There is currently no way to assess concordance metrics on filtered sites - * exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them. - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { - - /** - * The callset you want to evaluate, typically this is where you'd put 'unassessed' callsets. - */ - @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true) - RodBinding evalBinding; - - /** - * The callset you want to treat as 'truth'. Can also be of unknown quality for the sake of callset comparisons. - */ - @Input(fullName="comp",shortName="comp",doc="The variants and genotypes to compare against",required=true) - RodBinding compBinding; - - /** - * The FILTER field of the eval and comp VCFs will be ignored. If this flag is not included, all FILTER sites will - * be treated as not being present in the VCF. (That is, the genotypes will be assigned UNAVAILABLE, as distinct - * from NO_CALL). - */ - @Argument(fullName="ignoreFilters",doc="Filters will be ignored",required=false) - boolean ignoreFilters = false; - - /** - * A genotype level JEXL expression to apply to eval genotypes. Genotypes filtered in this way will be replaced by NO_CALL. - * For instance: -gfe 'GQ<20' will set to no-call any genotype with genotype quality less than 20. - */ - @Argument(shortName="gfe", fullName="genotypeFilterExpressionEval", doc="One or more criteria to use to set EVAL genotypes to no-call. 
"+ - "These genotype-level filters are only applied to the EVAL rod.", required=false) - public ArrayList genotypeFilterExpressionsEval = new ArrayList(); - - /** - * Identical to -gfe except the filter is applied to genotypes in the comp rod. - */ - @Argument(shortName="gfc", fullName="genotypeFilterExpressionComp", doc="One or more criteria to use to set COMP genotypes to no-call. "+ - "These genotype-level filters are only applied to the COMP rod.", required=false) - public ArrayList genotypeFilterExpressionsComp = new ArrayList(); - - /** - * Moltenize the count and proportion tables. Rather than moltenizing per-sample data into a 2x2 table, it is fully - * moltenized into elements. That is, WITHOUT this argument, each row of the table begins with the sample name and - * proceeds directly with counts/proportions of eval/comp counts (for instance HOM_REF/HOM_REF, HOM_REF/NO_CALL). - * - * If the Moltenize argument is given, the output will begin with a sample name, followed by the contrastive genotype - * type (such as HOM_REF/HOM_REF), followed by the count or proportion. This will significantly increase the number of - * rows. - */ - @Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output") - public boolean moltenize = false; - - @Output - PrintStream out; - - private List evalSamples; - private List compSamples; - private List evalJexls = null; - private List compJexls = null; - - // todo -- table with "proportion of overlapping sites" (not just eval/comp margins) [e.g. 
drop no-calls] - // (this will break all the integration tests of course, due to new formatting) - - public void initialize() { - evalJexls = initializeJexl(genotypeFilterExpressionsEval); - compJexls = initializeJexl(genotypeFilterExpressionsComp); - } - - private List initializeJexl(ArrayList genotypeFilterExpressions) { - ArrayList dummyNames = new ArrayList(genotypeFilterExpressions.size()); - int expCount = 1; - for ( String exp : genotypeFilterExpressions ) { - dummyNames.add(String.format("gfe%d",expCount++)); - } - return VariantContextUtils.initializeMatchExps(dummyNames, genotypeFilterExpressions); - } - - public ConcordanceMetrics reduceInit() { - Map headerMap = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(evalBinding,compBinding)); - VCFHeader evalHeader = headerMap.get(evalBinding.getName()); - evalSamples = evalHeader.getGenotypeSamples(); - VCFHeader compHeader = headerMap.get(compBinding.getName()); - compSamples = compHeader.getGenotypeSamples(); - return new ConcordanceMetrics(evalHeader,compHeader); - } - - - public List> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List> evalCompPair = new ArrayList>(3); - if ( tracker != null && ( - tracker.getValues(evalBinding,ref.getLocus()).size() > 0 || - tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { - - List eval = tracker.getValues(evalBinding,ref.getLocus()); - List comp = tracker.getValues(compBinding,ref.getLocus()); - if ( eval.size() > 1 || comp.size() > 1 ) { - if ( noDuplicateTypes(eval) && noDuplicateTypes(comp) ) { - logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving."); - evalCompPair = resolveMultipleRecords(eval,comp); - } else { - logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records of the same type. 
This locus will be skipped."); - } - } else { - // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. - // note that if there is no eval rod there must be a comp rod, and also the reverse - VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples); - VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples); - evalContext = filterGenotypes(evalContext,ignoreFilters,evalJexls); - compContext = filterGenotypes(compContext,ignoreFilters,compJexls); - evalCompPair.add(new Pair(evalContext,compContext)); - } - } - - return evalCompPair; - } - - private boolean noDuplicateTypes(List vcList) { - HashSet types = new HashSet(vcList.size()); - for ( VariantContext vc : vcList ) { - VariantContext.Type type = vc.getType(); - if ( types.contains(type) ) - return false; - types.add(type); - } - - return true; - } - - /** - * The point of this method is to match up pairs of evals and comps by their type (or alternate alleles for mixed). - * Basically multiple records could exist for a site such as: - * Eval: 20 4000 A C - * Eval: 20 4000 A AC - * Comp: 20 4000 A C - * So for each eval, loop through the comps. If the types match, or for mixed types if eval alleles (non-emptily) - * intersect the comp alleles, pair them up and remove that comp records. - * Continue until we're out of evals or comps. This is n^2, but should rarely actually happen. - * - * The remaining unpaired records get paird with an empty contexts. 
So in the example above we'd get a list of: - * 1 - (20,4000,A/C | 20,4000,A/C) - * 2 - (20,4000,A/AC | Empty ) - * @param evalList - list of eval variant contexts - * @param compList - list of comp variant contexts - * @return resolved pairs of the input lists - */ - private List> resolveMultipleRecords(List evalList, List compList) { - List> resolvedPairs = new ArrayList>(evalList.size()+compList.size()); // oversized but w/e - List pairedEval = new ArrayList(evalList.size()); - for ( VariantContext eval : evalList ) { - VariantContext.Type evalType = eval.getType(); - Set evalAlleles = new HashSet(eval.getAlternateAlleles()); - VariantContext pairedComp = null; - for ( VariantContext comp : compList ) { - if ( evalType.equals(comp.getType()) ) { - pairedComp = comp; - break; - } else if ( eval.isMixed() || comp.isMixed() ) { - for ( Allele compAllele : comp.getAlternateAlleles() ) { - if ( evalAlleles.contains(compAllele) ) { - pairedComp = comp; - break; - } - } - } - } - if ( pairedComp != null ) { - compList.remove(pairedComp); - resolvedPairs.add(new Pair(filterGenotypes(eval,ignoreFilters,evalJexls),filterGenotypes(pairedComp,ignoreFilters,compJexls))); - pairedEval.add(eval); - if ( compList.size() < 1 ) - break; - } - } - evalList.removeAll(pairedEval); - for ( VariantContext unpairedEval : evalList ) { - resolvedPairs.add(new Pair(filterGenotypes(unpairedEval,ignoreFilters,evalJexls),createEmptyContext(unpairedEval,compSamples))); - } - - for ( VariantContext unpairedComp : compList ) { - resolvedPairs.add(new Pair(createEmptyContext(unpairedComp,evalSamples),filterGenotypes(unpairedComp,ignoreFilters,compJexls))); - } - - return resolvedPairs; - } - - public ConcordanceMetrics reduce(List> evalCompList, ConcordanceMetrics metrics) { - for ( Pair evalComp : evalCompList) - metrics.update(evalComp.getFirst(),evalComp.getSecond()); - return metrics; - } - - private static double repairNaN(double d) { - if ( Double.isNaN(d) ) { - return 0.0; - } - return d; 
- } - - public void onTraversalDone(ConcordanceMetrics metrics) { - // todo -- this is over 200 lines of code just to format the output and could use some serious cleanup - GATKReport report = new GATKReport(); - GATKReportTable concordanceCounts = new GATKReportTable("GenotypeConcordance_Counts","Per-sample concordance tables: comparison counts",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceEvalProportions = new GATKReportTable("GenotypeConcordance_EvalProportions", "Per-sample concordance tables: proportions of genotypes called in eval",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceCompProportions = new GATKReportTable("GenotypeConcordance_CompProportions", "Per-sample concordance tables: proportions of genotypes called in comp",2+GenotypeType.values().length*GenotypeType.values().length); - GATKReportTable concordanceSummary = new GATKReportTable("GenotypeConcordance_Summary","Per-sample summary statistics: NRS, NRD, and OGC",2); - GATKReportTable siteConcordance = new GATKReportTable("SiteConcordance_Summary","Site-level summary statistics",ConcordanceMetrics.SiteConcordanceType.values().length); - if ( moltenize ) { - concordanceCompProportions.addColumn("Sample","%s"); - concordanceCounts.addColumn("Sample","%s"); - concordanceEvalProportions.addColumn("Sample","%s"); - concordanceSummary.addColumn("Sample","%s"); - - concordanceCompProportions.addColumn("Eval_Genotype","%s"); - concordanceCounts.addColumn("Eval_Genotype","%s"); - concordanceEvalProportions.addColumn("Eval_Genotype","%s"); - concordanceSummary.addColumn("Non-Reference_Discrepancy","%.3f"); - - concordanceCompProportions.addColumn("Comp_Genotype","%s"); - concordanceCounts.addColumn("Comp_Genotype","%s"); - concordanceEvalProportions.addColumn("Comp_Genotype","%s"); - concordanceSummary.addColumn("Non-Reference_Sensitivity","%.3f"); - - concordanceCompProportions.addColumn("Proportion","%.3f"); - 
concordanceCounts.addColumn("Count","%d"); - concordanceEvalProportions.addColumn("Proportion","%.3f"); - concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); - - for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { - ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String rowKey = String.format("%s_%s_%s",entry.getKey(),evalType.toString(),compType.toString()); - concordanceCounts.set(rowKey,"Sample",entry.getKey()); - concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(rowKey,"Count",count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { - concordanceEvalProportions.set(rowKey,"Sample",entry.getKey()); - concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - } - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { - concordanceCompProportions.set(rowKey,"Sample",entry.getKey()); - concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - } - String mismatchKey = String.format("%s_%s",entry.getKey(),"Mismatching"); - concordanceCounts.set(mismatchKey,"Sample",entry.getKey()); - concordanceCounts.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - 
concordanceCounts.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Sample",entry.getKey()); - concordanceEvalProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(mismatchKey,"Sample",entry.getKey()); - concordanceCompProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(mismatchKey,"Count",table.getnMismatchingAlt()); - } - - String sampleKey = "ALL"; - ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String rowKey = String.format("%s_%s_%s",sampleKey,evalType.toString(),compType.toString()); - concordanceCounts.set(rowKey,"Sample",sampleKey); - concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(rowKey,"Count",count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) { - concordanceEvalProportions.set(rowKey,"Sample",sampleKey); - concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - } - if ( 
compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) { - concordanceCompProportions.set(rowKey,"Sample",sampleKey); - concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString()); - concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString()); - concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - } - String rowKey = String.format("%s_%s",sampleKey,"Mismatching"); - concordanceCounts.set(rowKey,"Sample",sampleKey); - concordanceCounts.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCounts.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Sample",sampleKey); - concordanceEvalProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(rowKey,"Sample",sampleKey); - concordanceCompProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles"); - concordanceCompProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles"); - concordanceEvalProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(rowKey,"Count",table.getnMismatchingAlt()); - - for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { - concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); - concordanceSummary.set(nrsEntry.getKey(),"Non-Reference_Sensitivity",nrsEntry.getValue()); - } - for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { - concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue()); - } - for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { - 
concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); - } - concordanceSummary.set("ALL_NRS_NRD","Sample","ALL"); - concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS()); - concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD()); - concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC()); - - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.addColumn(type.toString(),"%d"); - } - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); - } - - } else { - concordanceCompProportions.addColumn("Sample","%s"); - concordanceCounts.addColumn("Sample","%s"); - concordanceEvalProportions.addColumn("Sample","%s"); - concordanceSummary.addColumn("Sample","%s"); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s", evalType.toString(), compType.toString()); - concordanceCounts.addColumn(colKey,"%d"); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.addColumn(colKey,"%.3f"); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.addColumn(colKey,"%.3f"); - } - } - concordanceEvalProportions.addColumn("Mismatching_Alleles","%.3f"); - concordanceCompProportions.addColumn("Mismatching_Alleles","%.3f"); - concordanceCounts.addColumn("Mismatching_Alleles","%d"); - concordanceSummary.addColumn("Non-Reference Sensitivity","%.3f"); - concordanceSummary.addColumn("Non-Reference Discrepancy","%.3f"); - 
concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f"); - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.addColumn(type.toString(),"%d"); - } - - for ( Map.Entry entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { - ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); - concordanceEvalProportions.set(entry.getKey(),"Sample",entry.getKey()); - concordanceCompProportions.set(entry.getKey(),"Sample",entry.getKey()); - concordanceCounts.set(entry.getKey(),"Sample",entry.getKey()); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); - int count = table.get(evalType, compType); - concordanceCounts.set(entry.getKey(),colKey,count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - concordanceEvalProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(entry.getKey(),"Mismatching_Alleles",table.getnMismatchingAlt()); - } - - String rowKey = "ALL"; - concordanceCompProportions.set(rowKey,"Sample",rowKey); - concordanceEvalProportions.set(rowKey,"Sample",rowKey); - concordanceCounts.set(rowKey,"Sample",rowKey); - 
ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance(); - for ( GenotypeType evalType : GenotypeType.values() ) { - for ( GenotypeType compType : GenotypeType.values() ) { - String colKey = String.format("%s_%s",evalType.toString(),compType.toString()); - int count = table.get(evalType,compType); - concordanceCounts.set(rowKey,colKey,count); - if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) - concordanceEvalProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType))); - if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) - concordanceCompProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType))); - } - } - concordanceEvalProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes())); - concordanceCompProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes())); - concordanceCounts.set(rowKey,"Mismatching_Alleles",table.getnMismatchingAlt()); - - for ( Map.Entry nrsEntry : metrics.getPerSampleNRS().entrySet() ) { - concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey()); - concordanceSummary.set(nrsEntry.getKey(),"Non-Reference Sensitivity",nrsEntry.getValue()); - } - for ( Map.Entry nrdEntry : metrics.getPerSampleNRD().entrySet() ) { - concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue()); - } - for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { - concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); - } - concordanceSummary.set("ALL","Sample","ALL"); - concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS()); - concordanceSummary.set("ALL","Non-Reference 
Discrepancy",metrics.getOverallNRD()); - concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC()); - - for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { - siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type)); - } - } - - report.addTable(concordanceCompProportions); - report.addTable(concordanceEvalProportions); - report.addTable(concordanceCounts); - report.addTable(concordanceSummary); - report.addTable(siteConcordance); - - report.print(out); - } - - public VariantContext createEmptyContext(VariantContext other, List samples) { - VariantContextBuilder builder = new VariantContextBuilder(); - // set the alleles to be the same - builder.alleles(other.getAlleles()); - builder.loc(other.getChr(),other.getStart(),other.getEnd()); - // set all genotypes to empty - List genotypes = new ArrayList(samples.size()); - for ( String sample : samples ) - genotypes.add(GenotypeBuilder.create(sample, new ArrayList(0))); - builder.genotypes(genotypes); - return builder.make(); - } - - public VariantContext filterGenotypes(VariantContext context, boolean ignoreSiteFilter, List exps) { - if ( ! 
context.isFiltered() || ignoreSiteFilter ) { - List filteredGenotypes = new ArrayList(context.getNSamples()); - for ( Genotype g : context.getGenotypes() ) { - Map matchMap = VariantContextUtils.match(context, g, exps); - boolean filtered = false; - for ( Boolean b : matchMap.values() ) { - if ( b ) { - filtered = true; - break; - } - } - if ( filtered ) { - filteredGenotypes.add(GenotypeBuilder.create(g.getSampleName(),Arrays.asList(Allele.NO_CALL,Allele.NO_CALL),g.getExtendedAttributes())); - } else { - filteredGenotypes.add(g); - } - } - VariantContextBuilder builder = new VariantContextBuilder(context); - builder.genotypes(filteredGenotypes); - return builder.make(); - } - - VariantContextBuilder builder = new VariantContextBuilder(); - builder.alleles(Arrays.asList(context.getReference())); - builder.loc(context.getChr(),context.getStart(),context.getEnd()); - List newGeno = new ArrayList(context.getNSamples()); - for ( Genotype g : context.getGenotypes().iterateInSampleNameOrder() ) { - newGeno.add(GenotypeBuilder.create(g.getSampleName(),new ArrayList())); - } - builder.genotypes(newGeno); - return builder.make(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java deleted file mode 100644 index 9168d17f0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java +++ /dev/null @@ -1,299 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished 
to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.variantutils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.VCFHeader; -import 
org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.util.*; - -/** - * Left-aligns indels from a variants file. - * - *

- * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be - * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to - * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. - * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. - * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. - * - *

Input

- *

- * A variant set to left-align and trim. - *

- * - *

Output

- *

- * A left-aligned VCF. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T LeftAlignAndTrimVariants \
- *   --variant input.vcf \
- *   -o output.vcf
- * 
- * - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! -public class LeftAlignAndTrimVariants extends RodWalker { - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - /** - * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. - */ - @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) - protected boolean trimAlleles = false; - - /** - * If this argument is set, split multiallelic records and left-align individual alleles. - * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. - */ - @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) - protected boolean splitMultiallelics = false; - - - @Output(doc="File to which variants should be written") - protected VariantContextWriter baseWriter = null; - - private VariantContextWriter writer; - - private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
- public void initialize() { - String trackName = variantCollection.variants.getName(); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - - Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); - baseWriter.writeHeader(new VCFHeader(headerLines, samples)); - - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - - int changedSites = 0; - for ( final VariantContext vc : VCs ) { - // split first into biallelics, and optionally trim alleles to minimal representation - Pair result = new Pair(vc,0); // default value - if (splitMultiallelics) { - final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics( vc); - for (final VariantContext biallelicVC: vcList) { - final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true):biallelicVC); - result = alignAndWrite(v, ref); - writer.add(result.first); - changedSites += result.second; - } - } - else { - if (trimAlleles) - result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); - else - result = alignAndWrite(vc,ref); - writer.add(result.first); - changedSites += result.second; - - } - - } - - return changedSites; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - writer.close(); - System.out.println(result + " variants were aligned"); - } - - /** - * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be - * handled by calling routine. 
- * @param vc Input VC with variants to left align - * @param ref Reference context - * @return # of records left-aligned (0 or 1) and new VC. - */ - @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) - @Ensures({"result != null","result.first != null", "result.second >=0"}) - protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { - - final Pair retValue = new Pair(vc,0); - if (!vc.isIndel() || vc.isComplexIndel() ) { - return retValue; - } - - // get the indel length - final int indelLength; - if ( vc.isSimpleDeletion() ) - indelLength = vc.getReference().length() - 1; - else - indelLength = vc.getAlternateAllele(0).length() - 1; - - if ( indelLength > MAX_INDEL_LENGTH ) - return retValue; - - if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) - return retValue; - - final byte[] refSeq = ref.getBases(); - - // create an indel haplotype. - // - final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; - if (originalIndex < 0 || originalIndex >= ref.getBases().length) - return retValue; - - final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); - - // create a CIGAR string to represent the event - ArrayList elements = new ArrayList(); - elements.add(new CigarElement(originalIndex, CigarOperator.M)); - elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? 
CigarOperator.D : CigarOperator.I)); - elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); - Cigar originalCigar = new Cigar(elements); - - // left align the CIGAR - Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0, true); - - // update if necessary and write - if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { - int difference = originalIndex - newCigar.getCigarElement(0).getLength(); - VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); - //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); - - final int indelIndex = originalIndex-difference; - final byte[] newBases = new byte[indelLength + 1]; - newBases[0] = refSeq[indelIndex-1]; - System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 1, indelLength); - final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); - newVC = updateAllele(newVC, newAllele); - // overwrite default return value with new left-aligned VC - retValue.first = newVC; - retValue.second = 1; - - } - return retValue; - } - - /** - * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference - * @param vc Input VC - will use only alt allele from it - * @param ref Ref bases - * @param indexOfRef Index in ref where to create indel - * @param indelLength Indel length - * @return - */ - @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) - @Ensures("result != null") - private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { - byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; - - // add the bases before the indel - System.arraycopy(ref, 0, hap, 0, indexOfRef); - int currentPos = indexOfRef; - - // take care of the indel - if ( vc.isSimpleDeletion() ) { - indexOfRef += indelLength; - } else { - System.arraycopy(vc.getAlternateAllele(0).getBases(), 1, hap, currentPos, indelLength); - currentPos += indelLength; - } - - // add the bases after the indel - System.arraycopy(ref, indexOfRef, hap, currentPos, ref.length - indexOfRef); - - return hap; - } - - public static VariantContext updateAllele(final VariantContext vc, final Allele newAllele) { - // create a mapping from original allele to new allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - if ( newAllele.isReference() ) { - alleleMap.put(vc.getReference(), newAllele); - alleleMap.put(vc.getAlternateAllele(0), Allele.create(newAllele.getBases()[0], false)); - } else { - alleleMap.put(vc.getReference(), Allele.create(newAllele.getBases()[0], true)); - alleleMap.put(vc.getAlternateAllele(0), newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { - Allele newA = alleleMap.get(allele); - if ( newA == null ) - newA = Allele.NO_CALL; - newAlleles.add(newA); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java deleted file mode 100644 index 8e5078f1f..000000000 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ /dev/null @@ -1,290 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* 
obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.tools; - -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import org.apache.log4j.BasicConfigurator; -import org.apache.log4j.Level; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.FeatureReader; -import org.broad.tribble.index.IndexCreator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; - -import java.io.*; -import java.util.*; - - -/** - * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples - * - *

- * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. - * This tool concatenates the scattered output VCF files. It assumes that: - * - All the input VCFs (or BCFs) contain the same samples in the same order. - * - The variants in each input file are from non-overlapping (scattered) intervals. - * - * When the input files are already sorted based on the intervals start positions, use -assumeSorted. - * - * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. - * - *

- * - *

Input

- *

- * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). - * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. - * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. - *

- * - *

Output

- *

- * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. - * <\p> - * - *

Important note

- *

This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to - * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the - * classic "CommandLineGATK" arguments.

- * - *

Example

- *
- * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
- *    -R ref.fasta \
- *    -V input1.vcf \
- *    -V input2.vcf \
- *    -out output.vcf \
- *    -assumeSorted
- * 
- * - * @author Ami Levy Moonshine - * @since Jan 2012 - */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP ) -public class CatVariants extends CommandLineProgram { - // setup the logging system, used by some codecs - private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); - - @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) - private File refFile = null; - - /** - * The VCF or BCF files to merge together - * - * CatVariants can take any number of -V arguments on the command line. Each -V argument - * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more - * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. - * - */ - @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) - private List variant = null; - - @Output(fullName = "outputFile", shortName = "out", doc = "output file name .vcf or .bcf", required = true) - private File outputFile = null; - - @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false) - private Boolean assumeSorted = false; - - @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) - private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - - @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) - private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; - - /* - * print usage information - */ - private static void printUsage() { - System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants [sorted 
(optional)]"); - System.err.println(" The input files can be of type: VCF (ends in .vcf or .VCF)"); - System.err.println(" BCF2 (ends in .bcf or .BCF)"); - System.err.println(" Output file must be vcf or bcf file (.vcf or .bcf)"); - System.err.println(" if the input files are already sorted, the last argument can indicate that"); - } - - @Override - protected int execute() throws Exception { - //if(help){ - // printUsage(); - // return 1; - //} - - BasicConfigurator.configure(); - logger.setLevel(Level.INFO); - - final ReferenceSequenceFile ref; - try { - ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); - } catch ( Exception e ) { - throw new UserException("Couldn't load provided reference sequence file " + refFile, e); - } - - Comparator> positionComparator = new PositionComparator(); - - - //PriorityQueue>> queue = - // new PriorityQueue>>(2000, comparator); - Queue> priorityQueue; - if(assumeSorted) - priorityQueue = new LinkedList>(); - else - priorityQueue = new PriorityQueue>(10000, positionComparator); - - Iterator files = variant.iterator(); - File file; - while (files.hasNext()) { - file = files.next(); - if (!(file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF") || file.getName().endsWith(".bcf") || file.getName().endsWith(".BCF"))){ - System.err.println("File " + file.getAbsolutePath() + " should be .vcf or .bcf"); - printUsage(); - return 1; - } - if (assumeSorted){ - priorityQueue.add(new Pair(0,file)); - } - else{ - if (!file.exists()) { - throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); - } - FeatureReader reader; - boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); - if(useVCF) - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); - else - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); - Iterator it = reader.iterator(); - if(!it.hasNext()){ - 
System.err.println(String.format("File %s is empty. This file will be ignored",file.getAbsolutePath())); - continue; - } - VariantContext vc = it.next(); - int firstPosition = vc.getStart(); - reader.close(); - //queue.add(new Pair>(firstPosition,reader)); - priorityQueue.add(new Pair(firstPosition,file)); - } - - } - - if (!(outputFile.getName().endsWith(".vcf") || outputFile.getName().endsWith(".VCF"))){ - throw new UserException(String.format("Output file %s should be .vcf", outputFile)); - } - - FileOutputStream outputStream = new FileOutputStream(outputFile); - EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile); - final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); - - boolean firstFile = true; - int count =0; - //while(!queue.isEmpty()){ - while(!priorityQueue.isEmpty() ){ - count++; - //FeatureReader reader = queue.remove().getSecond(); - file = priorityQueue.remove().getSecond(); - if (!file.exists()) { - throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath())); - } - FeatureReader reader; - boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF")); - if(useVCF) - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); - else - reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); - - if(count%10 ==0) - System.out.print(count); - else - System.out.print("."); - if (firstFile){ - VCFHeader header = (VCFHeader)reader.getHeader(); - outputWriter.writeHeader(header); - firstFile = false; - } - - Iterator it = reader.iterator(); - - while (it.hasNext()){ - VariantContext vc = it.next(); - outputWriter.add(vc); - } - - reader.close(); - - } - System.out.println(); - - outputStream.close(); - 
outputWriter.close(); - - return 0; - } - - - public static void main(String[] args){ - try { - CatVariants instance = new CatVariants(); - start(instance, args); - System.exit(CommandLineProgram.result); - } catch ( UserException e ) { - printUsage(); - exitSystemWithUserError(e); - } catch ( Exception e ) { - exitSystemWithError(e); - } - } - - private static class PositionComparator implements Comparator> { - - @Override - public int compare(Pair p1, Pair p2) { - int startPositionP1 = p1.getFirst(); - int startPositionP2 = p2.getFirst(); - if (startPositionP1 == startPositionP2) - return 0; - return startPositionP1 < startPositionP2 ? -1 : 1 ; - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java deleted file mode 100644 index 82c9fe751..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ /dev/null @@ -1,1518 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.apache.commons.math.distribution.ExponentialDistributionImpl; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.math.BigDecimal; -import java.util.*; - -/** - * MathUtils is a static class (no instantiation allowed!) with some useful math methods. - * - * @author Kiran Garimella - */ -public class MathUtils { - - /** - * Private constructor. No instantiating this class! - */ - private MathUtils() { - } - - public static final double[] log10Cache; - public static final double[] log10FactorialCache; - private static final double[] jacobianLogTable; - private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; - private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; - private static final double MAX_JACOBIAN_TOLERANCE = 8.0; - private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; - private static final int MAXN = 70_000; - private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients - - /** - * The smallest log10 value we'll emit from normalizeFromLog10 and other functions - * where the real-space value is 0.0. 
- */ - public static final double LOG10_P_OF_ZERO = -1000000.0; - public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); - public static final double LOG_ONE_HALF = -Math.log10(2.0); - public static final double LOG_ONE_THIRD = -Math.log10(3.0); - private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); - private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); - - static { - log10Cache = new double[LOG10_CACHE_SIZE]; - log10FactorialCache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; - - log10Cache[0] = Double.NEGATIVE_INFINITY; - log10FactorialCache[0] = 0.0; - for (int k = 1; k < LOG10_CACHE_SIZE; k++) { - log10Cache[k] = Math.log10(k); - log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; - } - - for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); - - } - } - - /** - * Get a random int between min and max (inclusive) using the global GATK random number generator - * - * @param min lower bound of the range - * @param max upper bound of the range - * @return a random int >= min and <= max - */ - public static int randomIntegerInRange( final int min, final int max ) { - return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; - } - - // A fast implementation of the Math.round() method. This method does not perform - // under/overflow checking, so this shouldn't be used in the general case (but is fine - // if one is already make those checks before calling in to the rounding). - public static int fastRound(final double d) { - return (d > 0.0) ? 
(int) (d + 0.5d) : (int) (d - 0.5d); - } - - public static double approximateLog10SumLog10(final double[] vals) { - return approximateLog10SumLog10(vals, vals.length); - } - - public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { - - final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); - double approxSum = vals[maxElementIndex]; - - for (int i = 0; i < endIndex; i++) { - if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) - continue; - - final double diff = approxSum - vals[i]; - if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { - // See notes from the 2-inout implementation below - final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding - approxSum += MathUtils.jacobianLogTable[ind]; - } - } - - return approxSum; - } - - public static double approximateLog10SumLog10(final double a, final double b, final double c) { - return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); - } - - public static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if (small > big) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) - return big; - - final double diff = big - small; - if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup with integer quantization - // we have pre-stored correction for 0,0.1,0.2,... 
10.0 - final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding - return big + MathUtils.jacobianLogTable[ind]; - } - - public static double sum(final double[] values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - - public static long sum(final int[] x) { - long total = 0; - for (int v : x) - total += v; - return total; - } - - public static int sum(final byte[] x) { - int total = 0; - for (byte v : x) - total += (int)v; - return total; - } - - public static double percentage(int x, int base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static double ratio(final int num, final int denom) { - if ( denom > 0 ) { - return ((double) num)/denom; - } else { - if ( num == 0 && denom == 0) { - return 0.0; - } else { - throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - public static double ratio(final long num, final long denom) { - if ( denom > 0L ) { - return ((double) num)/denom; - } else { - if ( num == 0L && denom == 0L ) { - return 0.0; - } else { - throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - /** - * Converts a real space array of numbers (typically probabilities) into a log10 array - * - * @param prRealSpace - * @return - */ - public static double[] toLog10(final double[] prRealSpace) { - double[] log10s = new double[prRealSpace.length]; - for (int i = 0; i < prRealSpace.length; i++) { - log10s[i] = Math.log10(prRealSpace[i]); - } - return log10s; - } - - public static double log10sumLog10(final double[] log10p, final int start) { - return log10sumLog10(log10p, start, log10p.length); - } - - public static double log10sumLog10(final double[] log10p,final int start,final int finish) { - double sum = 0.0; - - double maxValue = arrayMax(log10p, finish); - if(maxValue == 
Double.NEGATIVE_INFINITY) - return maxValue; - - for (int i = start; i < finish; i++) { - if ( Double.isNaN(log10p[i]) || log10p[i] == Double.POSITIVE_INFINITY ) { - throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); - } - sum += Math.pow(10.0, log10p[i] - maxValue); - } - - return Math.log10(sum) + maxValue; - } - - public static double sumLog10(final double[] log10values) { - return Math.pow(10.0, log10sumLog10(log10values)); - } - - public static double log10sumLog10(final double[] log10values) { - return log10sumLog10(log10values, 0); - } - - public static boolean wellFormedDouble(final double val) { - return !Double.isInfinite(val) && !Double.isNaN(val); - } - - public static double bound(final double value, final double minBoundary, final double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); - } - - public static boolean isBounded(final double val, final double lower, final double upper) { - return val >= lower && val <= upper; - } - - public static boolean isPositive(final double val) { - return !isNegativeOrZero(val); - } - - public static boolean isPositiveOrZero(final double val) { - return isBounded(val, 0.0, Double.POSITIVE_INFINITY); - } - - public static boolean isNegativeOrZero(final double val) { - return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); - } - - public static boolean isNegative(final double val) { - return !isPositiveOrZero(val); - } - - /** - * Compares double values for equality (within 1e-6), or inequality. - * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. - */ - public static byte compareDoubles(final double a, final double b) { - return compareDoubles(a, b, 1e-6); - } - - /** - * Compares double values for equality (within epsilon), or inequality. 
- * - * @param a the first double value - * @param b the second double value - * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. - */ - public static byte compareDoubles(final double a, final double b, final double epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - /** - * Calculate f(x) = Normal(x | mu = mean, sigma = sd) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - public static double normalDistribution(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); - return a * b; - } - - /** - * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - - public static double normalDistributionLog10(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! 
wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); - final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; - return a + b; - } - - /** - * Calculate f(x) = x^2 - * @param x the value to square - * @return x * x - */ - public static double square(final double x) { - return x * x; - } - - /** - * Calculates the log10 of the binomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k number of successes - * @return the log10 of the binomial coefficient - */ - public static double binomialCoefficient(final int n, final int k) { - return Math.pow(10, log10BinomialCoefficient(n, k)); - } - - /** - * @see #binomialCoefficient(int, int) with log10 applied to result - */ - public static double log10BinomialCoefficient(final int n, final int k) { - if ( n < 0 ) { - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - } - if ( k > n || k < 0 ) { - throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); - } - - return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); - } - - /** - * Computes a binomial probability. This is computed using the formula - *

- * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - *

- * where n is the number of trials, k is the number of successes, and p is the probability of success - * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. - */ - public static double binomialProbability(final int n, final int k, final double p) { - return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); - } - - /** - * @see #binomialProbability(int, int, double) with log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k, final double log10p) { - if ( log10p > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); - double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); - return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 - */ - public static double binomialProbability(final int n, final int k) { - return Math.pow(10, log10BinomialProbability(n, k)); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k) { - return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); - } - - /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ - private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = - Collections.synchronizedMap(new LRUCache(10_000)); - - /** - * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will - * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a - * utility function. 
- */ - static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { - if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { - return null; - } else { - long result = 0; - result += (short) one; - result <<= 16; - result += (short) two; - result <<= 16; - result += (short) three; - return result; - } - } - - /** - * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * Assumes that the probability of a successful hit is fair (i.e. 0.5). - * - * This pure function is memoized because of its expensive BigDecimal calculations. - * - * @param n number of attempts for the number of hits - * @param k_start start (inclusive) of the cumulant sum (over hits) - * @param k_end end (inclusive) of the cumulant sum (over hits) - * @return - returns the cumulative probability - */ - public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { - if ( k_end > n ) - throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); - - // Fetch cached value, if applicable. 
- final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); - final Double memoizationCacheResult; - if (memoizationKey != null) { - memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); - } else { - memoizationCacheResult = null; - } - - final double result; - if (memoizationCacheResult != null) { - result = memoizationCacheResult; - } else { - double cumProb = 0.0; - double prevProb; - BigDecimal probCache = BigDecimal.ZERO; - - for (int hits = k_start; hits <= k_end; hits++) { - prevProb = cumProb; - final double probability = binomialProbability(n, hits); - cumProb += probability; - if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision - probCache = probCache.add(new BigDecimal(prevProb)); - cumProb = 0.0; - hits--; // repeat loop - // prevProb changes at start of loop - } - } - - result = probCache.add(new BigDecimal(cumProb)).doubleValue(); - if (memoizationKey != null) { - BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); - } - } - return result; - } - - /** - * Calculates the log10 of the multinomial coefficient. Designed to prevent - * overflows even with very large numbers. 
- * - * @param n total number of trials - * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return - */ - public static double log10MultinomialCoefficient(final int n, final int[] k) { - if ( n < 0 ) - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - double denominator = 0.0; - int sum = 0; - for (int x : k) { - if ( x < 0 ) - throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); - if ( x > n ) - throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); - denominator += log10Factorial(x); - sum += x; - } - if ( sum != n ) - throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); - return log10Factorial(n) - denominator; - } - - /** - * Computes the log10 of the multinomial distribution probability given a vector - * of log10 probabilities. Designed to prevent overflows even with very large numbers. - * - * @param n number of trials - * @param k array of number of successes for each possibility - * @param log10p array of log10 probabilities - * @return - */ - public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { - if (log10p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); - double log10Prod = 0.0; - for (int i = 0; i < log10p.length; i++) { - if ( log10p[i] > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); - log10Prod += log10p[i] * k[i]; - } - return log10MultinomialCoefficient(n, k) + log10Prod; - } - - /** - * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations. - * In this implementation, the value of n is inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. - */ - public static double multinomialCoefficient(final int[] k) { - int n = 0; - for (int xi : k) { - n += xi; - } - - return Math.pow(10, log10MultinomialCoefficient(n, k)); - } - - /** - * Computes a multinomial probability efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations, and - * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is - * inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. - */ - public static double multinomialProbability(final int[] k, final double[] p) { - if (p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); - - int n = 0; - double[] log10P = new double[p.length]; - for (int i = 0; i < p.length; i++) { - log10P[i] = Math.log10(p[i]); - n += k[i]; - } - return Math.pow(10, log10MultinomialProbability(n, k, log10P)); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an byte[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final byte[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an int[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final int[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of doubles - * - * @param x a double[] of numbers - * @return the RMS of the specified numbers. 
- */ - public static double rms(final Double[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (Double i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - public static double rms(final Collection l) { - if (l.size() == 0) - return 0.0; - - double rms = 0.0; - for (int i : l) - rms += i * i; - rms /= l.size(); - return Math.sqrt(rms); - } - - public static double distanceSquared(final double[] x, final double[] y) { - double dist = 0.0; - for (int iii = 0; iii < x.length; iii++) { - dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); - } - return dist; - } - - public static double round(final double num, final int digits) { - double result = num * Math.pow(10.0, (double) digits); - result = Math.round(result); - result = result / Math.pow(10.0, (double) digits); - return result; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @param takeLog10OfOutput if true, the output will be transformed back into log10 units - * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { - return normalizeFromLog10(array, takeLog10OfOutput, false); - } - - /** - * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space - * - * @param array - * @param takeLog10OfOutput - * @param keepInLogSpace - * - * @return - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. 
- double maxValue = arrayMax(array); - - // we may decide to just normalize in log space without converting to linear space - if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) { - array[i] -= maxValue; - } - return array; - } - - // default case: go to linear space - double[] normalized = new double[array.length]; - - for (int i = 0; i < array.length; i++) - normalized[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += normalized[i]; - for (int i = 0; i < array.length; i++) { - double x = normalized[i] / sum; - if (takeLog10OfOutput) { - x = Math.log10(x); - if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) - x = array[i] - maxValue; - } - - normalized[i] = x; - } - - return normalized; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - public static double[] normalizeFromLog10(final double[] array) { - return normalizeFromLog10(array, false); - } - - /** - * normalizes the real-space probability array. - * - * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok - * to have values in the array of > 1, or have the sum go above 0. 
- * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - @Requires("array != null") - @Ensures({"result != null"}) - public static double[] normalizeFromRealSpace(final double[] array) { - if ( array.length == 0 ) - return array; - - final double sum = sum(array); - final double[] normalized = new double[array.length]; - if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); - for ( int i = 0; i < array.length; i++ ) { - normalized[i] = array[i] / sum; - } - return normalized; - } - - public static int maxElementIndex(final double[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final double[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int maxElementIndex(final int[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final byte[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final int[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int maxElementIndex(final byte[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int arrayMax(final int[] array) { - return array[maxElementIndex(array)]; - } - - - public 
static double arrayMax(final double[] array) { - return array[maxElementIndex(array)]; - } - - public static double arrayMax(final double[] array, final int endIndex) { - return array[maxElementIndex(array, endIndex)]; - } - - public static double arrayMin(final double[] array) { - return array[minElementIndex(array)]; - } - - public static int arrayMin(final int[] array) { - return array[minElementIndex(array)]; - } - - public static byte arrayMin(final byte[] array) { - return array[minElementIndex(array)]; - } - - /** - * Compute the min element of a List - * @param array a non-empty list of integer - * @return the min - */ - public static int arrayMin(final List array) { - if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); - int min = array.get(0); - for ( final int i : array ) - if ( i < min ) min = i; - return min; - } - - /** - * Compute the median element of the list of integers - * @param array a list of integers - * @return the median element - */ - public static > T median(final List array) { - /* TODO -- from Valentin - the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). - - But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] - - My suggestion for a solution is then: - - unify median and medianDoubles to public static T median(Collection) - check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. - relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) - In addition, the current median implementation sorts the whole input list witch is O(n log n). 
However find out the ith element (thus calculate the median) can be done in O(n) - */ - if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); - final int size = array.size(); - if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); - else if ( size == 1 ) return array.get(0); - else { - final ArrayList sorted = new ArrayList<>(array); - Collections.sort(sorted); - return sorted.get(size / 2); - } - } - - public static int minElementIndex(final double[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final byte[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final int[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int arrayMaxInt(final List array) { - if (array == null) - throw new IllegalArgumentException("Array cannot be null!"); - if (array.size() == 0) - throw new IllegalArgumentException("Array size cannot be 0!"); - - int m = array.get(0); - for (int e : array) - m = Math.max(m, e); - return m; - } - - public static int sum(final List list ) { - int sum = 0; - for ( Integer i : list ) { - sum += i; - } - return sum; - } - - public static double average(final List vals, final int maxI) { - long sum = 0L; - - int i = 0; - for (long x : vals) { - if (i > maxI) - break; - sum += x; - i++; - } - - return (1.0 * sum) / i; - } - - public static 
double average(final List vals) { - return average(vals, vals.size()); - } - - public static int countOccurrences(final char c, final String s) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - count += s.charAt(i) == c ? 1 : 0; - } - return count; - } - - public static int countOccurrences(T x, List l) { - int count = 0; - for (T y : l) { - if (x.equals(y)) - count++; - } - - return count; - } - - public static int countOccurrences(byte element, byte[] array) { - int count = 0; - for (byte y : array) { - if (element == y) - count++; - } - - return count; - } - - public static int countOccurrences(final boolean element, final boolean[] array) { - int count = 0; - for (final boolean b : array) { - if (element == b) - count++; - } - - return count; - } - - - /** - * Returns n random indices drawn with replacement from the range 0..(k-1) - * - * @param n the total number of indices sampled from - * @param k the number of random indices to draw (with replacement) - * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates - */ - static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { - - ArrayList chosen_balls = new ArrayList(k); - for (int i = 0; i < k; i++) { - //Integer chosen_ball = balls[rand.nextInt(k)]; - chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); - //balls.remove(chosen_ball); - } - - return chosen_balls; - } - - /** - * Returns n random indices drawn without replacement from the range 0..(k-1) - * - * @param n the total number of indices sampled from - * @param k the number of random indices to draw (without replacement) - * @return a list of k random indices ranging from 0 to (n-1) without duplicates - */ - static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { - ArrayList chosen_balls = new ArrayList(k); - - for (int i = 0; i < n; i++) { - chosen_balls.add(i); - } - - Collections.shuffle(chosen_balls, 
GenomeAnalysisEngine.getRandomGenerator()); - - //return (ArrayList) chosen_balls.subList(0, k); - return new ArrayList(chosen_balls.subList(0, k)); - } - - /** - * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - * - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices - */ - static public ArrayList sliceListByIndices(final List indices, final List list) { - ArrayList subset = new ArrayList(); - - for (int i : indices) { - subset.add(list.get(i)); - } - - return subset; - } - - /** - * Given two log-probability vectors, compute log of vector product of them: - * in Matlab notation, return log10(10.*x'*10.^y) - * @param x vector 1 - * @param y vector 2 - * @return a double representing log (dotProd(10.^x,10.^y) - */ - public static double logDotProduct(final double [] x, final double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Vectors of different lengths"); - - double tmpVec[] = new double[x.length]; - - for (int k=0; k < tmpVec.length; k++ ) { - tmpVec[k] = x[k]+y[k]; - } - - return log10sumLog10(tmpVec); - - - - } - - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! 
goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value. By default allows - * -Infinity values, as log10(0.0) == -Infinity. - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result) { - return goodLog10Probability(result, true); - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @param allowNegativeInfinity should we consider a -Infinity value ok? - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { - return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); - } - - /** - * Checks that the result is a well-formed probability - * - * @param result a supposedly well-formed probability value - * @return true if result is really well formed - */ - public static boolean goodProbability(final double result) { - return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } - - /** - * A utility class that computes on the fly average and standard deviation for a stream of numbers. - * The number of observations does not have to be known in advance, and can be also very big (so that - * it could overflow any naive summation-based scheme or cause loss of precision). - * Instead, adding a new number observed - * to a sample with add(observed) immediately updates the instance of this object so that - * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 - * (see also e.g. 
http://www.johndcook.com/standard_deviation.html for online reference). - */ - public static class RunningAverage { - private double mean = 0.0; - private double s = 0.0; - private long obs_count = 0; - - public void add(double obs) { - obs_count++; - double oldMean = mean; - mean += (obs - mean) / obs_count; // update mean - s += (obs - oldMean) * (obs - mean); - } - - public void addAll(Collection col) { - for (Number o : col) { - add(o.doubleValue()); - } - } - - public double mean() { - return mean; - } - - public double stddev() { - return Math.sqrt(s / (obs_count - 1)); - } - - public double var() { - return s / (obs_count - 1); - } - - public long observationCount() { - return obs_count; - } - - public RunningAverage clone() { - RunningAverage ra = new RunningAverage(); - ra.mean = this.mean; - ra.s = this.s; - ra.obs_count = this.obs_count; - return ra; - } - - public void merge(RunningAverage other) { - if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all - this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); - this.s += other.s; - } - this.obs_count += other.obs_count; - } - } - - // - // useful common utility routines - // - - static public double max(double x0, double x1, double x2) { - double a = Math.max(x0, x1); - return Math.max(a, x2); - } - - /** - * Converts LN to LOG10 - * - * @param ln log(x) - * @return log10(x) - */ - public static double lnToLog10(final double ln) { - return ln * Math.log10(Math.E); - } - - /** - * Constants to simplify the log gamma function calculation. 
- */ - private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift - */ - private static final int HI(final double x) { - return (int) (Double.doubleToLongBits(x) >> 32); - } - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift - */ - private static final int LO(final double x) { - return (int) Double.doubleToLongBits(x); - } - - /** - * Most efficent implementation of the lnGamma (FDLIBM) - * Use via the log10Gamma wrapper method. - */ - private static double lnGamma(final double x) { - double t, y, z, p, p1, p2, p3, q, r, w; - int i; - - int hx = HI(x); - int lx = LO(x); - - /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx & 0x7fffffff; - if (ix >= 0x7ff00000) - return Double.POSITIVE_INFINITY; - if ((ix | lx) == 0 || hx < 0) - return Double.NaN; - if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ - return -Math.log(x); - } - - /* purge off 1 and 2 */ - if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) - r = 0; - /* for x < 2.0 */ - else if (ix < 0x40000000) { - if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ - r = -Math.log(x); - if (ix >= 0x3FE76944) { - y = one - x; - i = 0; - } - else if (ix >= 0x3FCDA661) { - y = x - (tc - one); - i = 1; - } - else { - y = x; - i = 2; - } - } - else { - r = zero; - if (ix >= 0x3FFBB4C3) { - y = 2.0 - x; - i = 0; - } /* [1.7316,2] */ - else if (ix >= 0x3FF3B4C4) { - y = x - tc; - i = 1; - } /* [1.23,1.73] */ - else { - y = x - one; - i = 2; - } - } - - switch (i) { - case 0: - z = y * y; - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); - p = y * p1 + p2; - r += (p - 0.5 * y); - break; - case 1: - z = y * y; - w = z * y; - p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); - p = z * p1 - (tt - w * (p2 + y * p3)); - r += (tf + p); - break; - case 2: - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); - r += (-0.5 * y + p1 / p2); - } - } - else if (ix < 0x40200000) { /* x < 8.0 */ - i = (int) x; - t = zero; - y = x - (double) i; - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); - r = half * y + p / q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch (i) { - case 7: - z *= (y + 6.0); /* FALLTHRU */ - case 6: - z *= (y + 5.0); /* FALLTHRU */ - case 5: - z *= (y + 4.0); /* FALLTHRU */ - case 4: - z *= (y + 3.0); /* FALLTHRU */ - case 3: - z *= (y + 2.0); /* FALLTHRU */ - r += Math.log(z); - break; - } - /* 8.0 <= x < 2**58 */ - } - else if (ix < 0x43900000) { - t = Math.log(x); - z = one / x; - y = z * z; - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); - r = (x - half) * (t - one) + w; - } - else - /* 2**58 <= x <= inf */ - r = x * (Math.log(x) - one); - return r; - } - - /** - * Calculates the log10 of the gamma function for x using the efficient FDLIBM - * implementation to avoid overflows and guarantees high accuracy even for large - * numbers. - * - * @param x the x parameter - * @return the log10 of the gamma function at x. 
- */ - public static double log10Gamma(final double x) { - return lnToLog10(lnGamma(x)); - } - - public static double factorial(final int x) { - // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value - return (double)Math.round(Math.pow(10, log10Factorial(x))); - } - - public static double log10Factorial(final int x) { - if (x >= log10FactorialCache.length || x < 0) - return log10Gamma(x + 1); - else - return log10FactorialCache[x]; - } - - /** - * Adds two arrays together and returns a new array with the sum. - * - * @param a one array - * @param b another array - * @return a new array with the sum of a and b - */ - @Requires("a.length == b.length") - @Ensures("result.length == a.length") - public static int[] addArrays(final int[] a, final int[] b) { - int[] c = new int[a.length]; - for (int i = 0; i < a.length; i++) - c[i] = a[i] + b[i]; - return c; - } - - /** Same routine, unboxed types for efficiency - * - * @param x First vector - * @param y Second vector - * @return Vector of same length as x and y so that z[k] = x[k]+y[k] - */ - public static double[] vectorSum(final double[]x, final double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); - - double[] result = new double[x.length]; - for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList<>(); - final double log10range = Math.log10(stop - start); - - if ( start == 0 ) - values.add(0); - - double i = 0.0; - while ( i <= log10range ) { - final int index = (int)Math.round(Math.pow(10, i)) + start; - if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) - values.add(index); - i += eps; - } - - if ( values.peekLast() == null || values.peekLast() != stop ) - values.add(stop); - - return values; - } - - /** - * Compute in a numerical correct way the quantity log10(1-x) - 
* - * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow - * in 1-x when x is very small - * - * @param x a positive double value between 0.0 and 1.0 - * @return an estimate of log10(1-x) - */ - @Requires("x >= 0.0 && x <= 1.0") - @Ensures("result <= 0.0") - public static double log10OneMinusX(final double x) { - if ( x == 1.0 ) - return Double.NEGATIVE_INFINITY; - else if ( x == 0.0 ) - return 0.0; - else { - final double d = Math.log10(1 / x - 1) + Math.log10(x); - return Double.isInfinite(d) || d > 0.0 ? 0.0 : d; - } - } - - /** - * Draw N random elements from list - * @param list - the list from which to draw randomly - * @param N - the number of elements to draw - */ - public static List randomSubset(final List list, final int N) { - if (list.size() <= N) { - return list; - } - - return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); - } - - /** - * Return the likelihood of observing the counts of categories having sampled a population - * whose categorial frequencies are distributed according to a Dirichlet distribution - * @param dirichletParams - params of the prior dirichlet distribution - * @param dirichletSum - the sum of those parameters - * @param counts - the counts of observation in each category - * @param countSum - the sum of counts (number of trials) - * @return - associated likelihood - */ - public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, - final int[] counts, final int countSum) { - if ( dirichletParams.length != counts.length ) { - throw new IllegalStateException("The number of dirichlet parameters must match the number of categories"); - } - // todo -- lots of lnGammas here. 
At some point we can safely switch to x * ( ln(x) - 1) - double likelihood = log10MultinomialCoefficient(countSum,counts); - likelihood += log10Gamma(dirichletSum); - likelihood -= log10Gamma(dirichletSum+countSum); - for ( int idx = 0; idx < counts.length; idx++ ) { - likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); - likelihood -= log10Gamma(dirichletParams[idx]); - } - - return likelihood; - } - - public static double dirichletMultinomial(double[] params, int[] counts) { - return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); - } - - public static ExponentialDistribution exponentialDistribution( final double mean ) { - return new ExponentialDistributionImpl(mean); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java deleted file mode 100644 index c0d1df09d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ /dev/null @@ -1,389 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import net.sf.samtools.SAMUtils; - -/** - * QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating - * quality scores. - * - * @author Kiran Garimella, Mark DePristo - * @since Way back - */ -public class QualityUtils { - /** - * Maximum quality score that can be encoded in a SAM/BAM file - */ - public final static byte MAX_SAM_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; - - - private final static double RAW_MIN_PHRED_SCALED_QUAL = Math.log10(Double.MIN_VALUE); - protected final static double MIN_PHRED_SCALED_QUAL = -10.0 * RAW_MIN_PHRED_SCALED_QUAL; - - /** - * bams containing quals above this value are extremely suspicious and we should warn the user - */ - public final static byte MAX_REASONABLE_Q_SCORE = 60; - - /** - * The lowest quality score for a base that is considered reasonable for statistical analysis. This is - * because Q 6 => you stand a 25% of being right, which means all bases are equally likely - */ - public final static byte MIN_USABLE_Q_SCORE = 6; - public final static int MAPPING_QUALITY_UNAVAILABLE = 255; - - /** - * Cached values for qual as byte calculations so they are very fast - */ - private static double qualToErrorProbCache[] = new double[256]; - private static double qualToProbLog10Cache[] = new double[256]; - - - static { - for (int i = 0; i < 256; i++) { - qualToErrorProbCache[i] = qualToErrorProb((double) i); - qualToProbLog10Cache[i] = Math.log10(1.0 - qualToErrorProbCache[i]); - } - } - - /** - * Private constructor. No instantiating this class! 
- */ - private QualityUtils() {} - - // ---------------------------------------------------------------------- - // - // These are all functions to convert a phred-scaled quality score to a probability - // - // ---------------------------------------------------------------------- - - /** - * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a discretized byte value, this function uses a cache so is very efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a quality score (0-255) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToProb(final byte qual) { - return 1.0 - qualToErrorProb(qual); - } - - /** - * Convert a phred-scaled quality score to its probability of being true (Q30 => 0.999) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToProb(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return 1.0 - qualToErrorProb(qual); - } - - /** - * Convert a phred-scaled quality score to its log10 probability of being true (Q30 => log10(0.999)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. 
- * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToProbLog10(final byte qual) { - return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. - } - - /** - * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a double value, this function must call Math.pow so can be quite expensive - * - * @param qual a phred-scaled quality score encoded as a double. Can be non-integer values (30.5) - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToErrorProb(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return Math.pow(10.0, qual / -10.0); - } - - /** - * Convert a phred-scaled quality score to its probability of being wrong (Q30 => 0.001) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * Because the input is a byte value, this function uses a cache so is very efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. 
The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a byte - * @return a probability (0.0-1.0) - */ - @Ensures("result >= 0.0 && result <= 1.0") - public static double qualToErrorProb(final byte qual) { - return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. - } - - - /** - * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * The calculation is extremely efficient - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual a phred-scaled quality score encoded as a byte - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToErrorProbLog10(final byte qual) { - return qualToErrorProbLog10((double)(qual & 0xFF)); - } - - /** - * Convert a phred-scaled quality score to its log10 probability of being wrong (Q30 => log10(0.001)) - * - * This is the Phred-style conversion, *not* the Illumina-style conversion. - * - * The calculation is extremely efficient - * - * @param qual a phred-scaled quality score encoded as a double - * @return a probability (0.0-1.0) - */ - @Ensures("result <= 0.0") - public static double qualToErrorProbLog10(final double qual) { - if ( qual < 0.0 ) throw new IllegalArgumentException("qual must be >= 0.0 but got " + qual); - return qual / -10.0; - } - - // ---------------------------------------------------------------------- - // - // Functions to convert a probability to a phred-scaled quality score - // - // ---------------------------------------------------------------------- - - /** - * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). 
- * - * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE - * and by 1 at the low-end. - * - * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) - * @return a quality score (0-MAX_SAM_QUAL_SCORE) - */ - public static byte errorProbToQual(final double errorRate) { - return errorProbToQual(errorRate, MAX_SAM_QUAL_SCORE); - } - - /** - * Convert a probability of being wrong to a phred-scaled quality score (0.01 => 20). - * - * Note, this function caps the resulting quality score by the public static value MIN_REASONABLE_ERROR - * and by 1 at the low-end. - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param errorRate a probability (0.0-1.0) of being wrong (i.e., 0.01 is 1% change of being wrong) - * @return a quality score (0-maxQual) - */ - public static byte errorProbToQual(final double errorRate, final byte maxQual) { - if ( ! MathUtils.goodProbability(errorRate) ) throw new IllegalArgumentException("errorRate must be good probability but got " + errorRate); - final double d = Math.round(-10.0*Math.log10(errorRate)); - return boundQual((int)d, maxQual); - } - - /** - * @see #errorProbToQual(double, byte) with proper conversion of maxQual integer to a byte - */ - public static byte errorProbToQual(final double prob, final int maxQual) { - if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); - return errorProbToQual(prob, (byte)(maxQual & 0xFF)); - } - - /** - * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). - * - * Note, this function caps the resulting quality score by the public static value MAX_SAM_QUAL_SCORE - * and by 1 at the low-end. 
- * - * @param prob a probability (0.0-1.0) of being right - * @return a quality score (0-MAX_SAM_QUAL_SCORE) - */ - public static byte trueProbToQual(final double prob) { - return trueProbToQual(prob, MAX_SAM_QUAL_SCORE); - } - - /** - * Convert a probability of being right to a phred-scaled quality score (0.99 => 20). - * - * Note, this function caps the resulting quality score by the min probability allowed (EPS). - * So for example, if prob is 1e-6, which would imply a Q-score of 60, and EPS is 1e-4, - * the result of this function is actually Q40. - * - * Note that the resulting quality score, regardless of EPS, is capped by MAX_SAM_QUAL_SCORE and - * bounded on the low-side by 1. - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param trueProb a probability (0.0-1.0) of being right - * @param maxQual the maximum quality score we are allowed to emit here, regardless of the error rate - * @return a phred-scaled quality score (0-maxQualScore) as a byte - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") - public static byte trueProbToQual(final double trueProb, final byte maxQual) { - if ( ! 
MathUtils.goodProbability(trueProb) ) throw new IllegalArgumentException("trueProb must be good probability but got " + trueProb); - final double lp = Math.round(-10.0*MathUtils.log10OneMinusX(trueProb)); - return boundQual((int)lp, maxQual); - } - - /** - * @see #trueProbToQual(double, byte) with proper conversion of maxQual to a byte - */ - public static byte trueProbToQual(final double prob, final int maxQual) { - if ( maxQual < 0 || maxQual > 255 ) throw new IllegalArgumentException("maxQual must be between 0-255 but got " + maxQual); - return trueProbToQual(prob, (byte)(maxQual & 0xFF)); - } - - /** - * Convert a probability of being right to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param trueRate the probability of being right (0.0-1.0) - * @return a phred-scaled version of the error rate implied by trueRate - */ - @Ensures("result >= 0.0") - public static double phredScaleCorrectRate(final double trueRate) { - return phredScaleLog10ErrorRate(MathUtils.log10OneMinusX(trueRate)); - } - - /** - * Convert a log10 probability of being right to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param trueRateLog10 the log10 probability of being right (0.0-1.0). 
Can be -Infinity to indicate - * that the result is impossible in which MIN_PHRED_SCALED_QUAL is returned - * @return a phred-scaled version of the error rate implied by trueRate - */ - @Ensures("result >= 0.0") - public static double phredScaleLog10CorrectRate(final double trueRateLog10) { - return phredScaleCorrectRate(Math.pow(10.0, trueRateLog10)); - } - - /** - * Convert a probability of being wrong to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param errorRate the probability of being wrong (0.0-1.0) - * @return a phred-scaled version of the error rate - */ - @Ensures("result >= 0.0") - public static double phredScaleErrorRate(final double errorRate) { - return phredScaleLog10ErrorRate(Math.log10(errorRate)); - } - - /** - * Convert a log10 probability of being wrong to a phred-scaled quality score of being wrong as a double - * - * This is a very generic method, that simply computes a phred-scaled double quality - * score given an error rate. It has the same precision as a normal double operation - * - * @param errorRateLog10 the log10 probability of being wrong (0.0-1.0). Can be -Infinity, in which case - * the result is MIN_PHRED_SCALED_QUAL - * @return a phred-scaled version of the error rate - */ - @Ensures("result >= 0.0") - public static double phredScaleLog10ErrorRate(final double errorRateLog10) { - if ( ! 
MathUtils.goodLog10Probability(errorRateLog10) ) throw new IllegalArgumentException("errorRateLog10 must be good probability but got " + errorRateLog10); - // abs is necessary for edge base with errorRateLog10 = 0 producing -0.0 doubles - return Math.abs(-10.0 * Math.max(errorRateLog10, RAW_MIN_PHRED_SCALED_QUAL)); - } - - // ---------------------------------------------------------------------- - // - // Routines to bound a quality score to a reasonable range - // - // ---------------------------------------------------------------------- - - /** - * Return a quality score that bounds qual by MAX_SAM_QUAL_SCORE and 1 - * - * @param qual the uncapped quality score as an integer - * @return the bounded quality score - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (MAX_SAM_QUAL_SCORE & 0xFF)") - public static byte boundQual(int qual) { - return boundQual(qual, MAX_SAM_QUAL_SCORE); - } - - /** - * Return a quality score that bounds qual by maxQual and 1 - * - * WARNING -- because this function takes a byte for maxQual, you must be careful in converting - * integers to byte. The appropriate way to do this is ((byte)(myInt & 0xFF)) - * - * @param qual the uncapped quality score as an integer. 
Can be < 0 (which may indicate an error in the - * client code), which will be brought back to 1, but this isn't an error, as some - * routines may use this functionality (BaseRecalibrator, for example) - * @param maxQual the maximum quality score, must be less < 255 - * @return the bounded quality score - */ - @Ensures("(result & 0xFF) >= 1 && (result & 0xFF) <= (maxQual & 0xFF)") - public static byte boundQual(final int qual, final byte maxQual) { - return (byte) (Math.max(Math.min(qual, maxQual & 0xFF), 1) & 0xFF); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java deleted file mode 100644 index 69a2f0c8e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils; - - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.concurrent.TimeUnit; - -/** - * A useful simple system for timing code with nano second resolution - * - * Note that this code is not thread-safe. If you have a single timer - * being started and stopped by multiple threads you will need to protect the - * calls to avoid meaningless results of having multiple starts and stops - * called sequentially. - * - * User: depristo - * Date: Dec 10, 2010 - * Time: 9:07:44 AM - */ -public class SimpleTimer { - protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); - private final String name; - - /** - * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the - * sum of times between starts/restrats and stops. - */ - private long elapsedTimeNano = 0l; - - /** - * The start time of the last start/restart in nanoSeconds - */ - private long startTimeNano = 0l; - - /** - * Is this timer currently running (i.e., the last call was start/restart) - */ - private boolean running = false; - - /** - * Creates an anonymous simple timer - */ - public SimpleTimer() { - this("Anonymous"); - } - - /** - * Creates a simple timer named name - * @param name of the timer, must not be null - */ - public SimpleTimer(final String name) { - if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); - this.name = name; - } - - /** - * @return the name associated with this timer - */ - public synchronized String getName() { - return name; - } - - /** - * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to - * resetting the time to have no history at all. 
- * - * @return this object, for programming convenience - */ - @Ensures("elapsedTimeNano == 0l") - public synchronized SimpleTimer start() { - elapsedTimeNano = 0l; - return restart(); - } - - /** - * Starts the timer running, without resetting the elapsedTimeNano time. This function may be - * called without first calling start(). The only difference between start and restart - * is that start resets the elapsedTimeNano time, while restart does not. - * - * @return this object, for programming convenience - */ - public synchronized SimpleTimer restart() { - running = true; - startTimeNano = currentTimeNano(); - return this; - } - - /** - * @return is this timer running? - */ - public synchronized boolean isRunning() { - return running; - } - - /** - * @return A convenience function to obtain the current time in milliseconds from this timer - */ - public long currentTime() { - return System.currentTimeMillis(); - } - - /** - * @return A convenience function to obtain the current time in nanoSeconds from this timer - */ - public long currentTimeNano() { - return System.nanoTime(); - } - - /** - * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. - * - * It's ok to call stop on a timer that's not running. It has no effect on the timer. - * - * @return this object, for programming convenience - */ - @Requires("startTimeNano != 0l") - public synchronized SimpleTimer stop() { - if ( running ) { - running = false; - elapsedTimeNano += currentTimeNano() - startTimeNano; - } - return this; - } - - /** - * Returns the total elapsedTimeNano time of all start/stops of this timer. 
If the timer is currently - * running, includes the difference from currentTime() and the start as well - * - * @return this time, in seconds - */ - public synchronized double getElapsedTime() { - return nanoToSecondsAsDouble(getElapsedTimeNano()); - } - - protected static double nanoToSecondsAsDouble(final long nano) { - return nano * NANO_TO_SECOND_DOUBLE; - } - - /** - * @see #getElapsedTime() but returns the result in nanoseconds - * - * @return the elapsed time in nanoseconds - */ - public synchronized long getElapsedTimeNano() { - return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; - } - - /** - * Add the elapsed time from toAdd to this elapsed time - * - * @param toAdd the timer whose elapsed time we want to add to this timer - */ - public synchronized void addElapsed(final SimpleTimer toAdd) { - elapsedTimeNano += toAdd.getElapsedTimeNano(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java deleted file mode 100644 index 8f6af0158..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ /dev/null @@ -1,466 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.activeregion; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Represents a single active region created by the Active Region Traversal for processing - * - * An active region is a single contiguous span of bases on the genome that should be operated - * on as a single unit for the active region traversal. The action may contains a list of - * reads that overlap the region (may because there may be no reads in the region). The region - * is tagged as being either active or inactive, depending on the probabilities provided by - * the isActiveProb results from the ART walker. Each region carries with it the - * exact span of the region (bases which are the core of the isActiveProbs from the walker) as - * well as an extended size, that includes the ART walker's extension size. Reads in the region - * provided by ART include all reads overlapping the extended span, not the raw span. 
- * - * User: rpoplin - * Date: 1/4/12 - */ -@Invariant({ - "extension >= 0", - "activeRegionLoc != null", - "genomeLocParser != null", - "spanIncludingReads != null", - "extendedLoc != null" -}) -public class ActiveRegion implements HasGenomeLocation { - /** - * The reads included in this active region. May be empty upon creation, and expand / contract - * as reads are added or removed from this region. - */ - private final List reads = new ArrayList(); - - /** - * An ordered list (by genomic coordinate) of the ActivityProfileStates that went - * into this active region. May be empty, which says that no supporting states were - * provided when this region was created. - */ - private final List supportingStates; - - /** - * The raw span of this active region, not including the active region extension - */ - private final GenomeLoc activeRegionLoc; - - /** - * The span of this active region on the genome, including the active region extension - */ - private final GenomeLoc extendedLoc; - - /** - * The extension, in bp, of this active region. - */ - private final int extension; - - /** - * A genomeLocParser so we can create genomeLocs - */ - private final GenomeLocParser genomeLocParser; - - /** - * Does this region represent an active region (all isActiveProbs above threshold) or - * an inactive region (all isActiveProbs below threshold)? - */ - private final boolean isActive; - - /** - * The span of this active region, including the bp covered by all reads in this - * region. This union of extensionLoc and the loc of all reads in this region. - * - * Must be at least as large as extendedLoc, but may be larger when reads - * partially overlap this region. 
- */ - private GenomeLoc spanIncludingReads; - - - /** - * Indicates whether the active region has been finalized - */ - private boolean hasBeenFinalized; - - /** - * Create a new ActiveRegion containing no reads - * - * @param activeRegionLoc the span of this active region - * @param supportingStates the states that went into creating this region, or null / empty if none are available. - * If not empty, must have exactly one state for each bp in activeRegionLoc - * @param isActive indicates whether this is an active region, or an inactve one - * @param genomeLocParser a non-null parser to let us create new genome locs - * @param extension the active region extension to use for this active region - */ - public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { - if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); - if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); - if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); - if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); - - this.activeRegionLoc = activeRegionLoc; - this.supportingStates = supportingStates == null ? Collections.emptyList() : new ArrayList(supportingStates); - this.isActive = isActive; - this.genomeLocParser = genomeLocParser; - this.extension = extension; - this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); - this.spanIncludingReads = extendedLoc; - - if ( ! 
this.supportingStates.isEmpty() ) { - if ( this.supportingStates.size() != activeRegionLoc.size() ) - throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); - GenomeLoc lastStateLoc = null; - for ( final ActivityProfileState state : this.supportingStates ) { - if ( lastStateLoc != null ) { - if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) - throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); - } - lastStateLoc = state.getLoc(); - } - } - } - - /** - * Simple interface to create an active region that isActive without any profile state - */ - public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) { - this(activeRegionLoc, Collections.emptyList(), true, genomeLocParser, extension); - } - - @Override - public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); - } - - /** - * See #getActiveRegionReference but with padding == 0 - */ - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { - return getActiveRegionReference(referenceReader, 0); - } - - /** - * Get the reference bases from referenceReader spanned by the extended location of this active region, - * including additional padding bp on either side. 
If this expanded region would exceed the boundaries - * of the active region's contig, the returned result will be truncated to only include on-genome reference - * bases - * @param referenceReader the source of the reference genome bases - * @param padding the padding, in BP, we want to add to either side of this active region extended region - * @return a non-null array of bytes holding the reference bases in referenceReader - */ - @Ensures("result != null") - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference(referenceReader, padding, extendedLoc); - } - - /** - * See #getActiveRegionReference but using the span including regions not the extended span - */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { - return getFullReference(referenceReader, 0); - } - - /** - * See #getActiveRegionReference but using the span including regions not the extended span - */ - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference(referenceReader, padding, spanIncludingReads); - } - - /** - * Get the reference bases from referenceReader spanned by the extended location of this active region, - * including additional padding bp on either side. 
If this expanded region would exceed the boundaries - * of the active region's contig, the returned result will be truncated to only include on-genome reference - * bases - * @param referenceReader the source of the reference genome bases - * @param padding the padding, in BP, we want to add to either side of this active region extended region - * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for - * @return a non-null array of bytes holding the reference bases in referenceReader - */ - @Ensures("result != null") - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { - if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); - if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); - if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); - if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); - - final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), - Math.max(1, genomeLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); - - return reference; - } - - /** - * Get the raw span of this active region (excluding the extension) - * @return a non-null genome loc - */ - @Override - @Ensures("result != null") - public GenomeLoc getLocation() { return activeRegionLoc; } - - /** - * Get the span of this active region including the extension value - * @return a non-null GenomeLoc - */ - @Ensures("result != null") - public GenomeLoc getExtendedLoc() { return extendedLoc; } - - /** - * Get the span of this active region including the extension and the projects on the - * genome of all reads in this active 
region. That is, returns the bp covered by this - * region and all reads in the region. - * @return a non-null genome loc - */ - @Ensures("result != null") - public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } - - /** - * Get the active profile states that went into creating this region, if possible - * @return an unmodifiable list of states that led to the creation of this region, or an empty - * list if none were provided - */ - @Ensures("result != null") - public List getSupportingStates() { - return Collections.unmodifiableList(supportingStates); - } - - /** - * Get the active region extension applied to this region - * - * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions - * - * @return the size in bp of the region extension - */ - @Ensures("result >= 0") - public int getExtension() { return extension; } - - /** - * Get an unmodifiable list of reads currently in this active region. - * - * The reads are sorted by their coordinate position - * - * @return an unmodifiable list of reads in this active region - */ - @Ensures("result != null") - public List getReads() { - return Collections.unmodifiableList(reads); - } - - /** - * Get the number of reads currently in this active region - * @return an integer >= 0 - */ - @Ensures("result >= 0") - public int size() { return reads.size(); } - - /** - * Add read to this active region - * - * Read must have alignment start >= than the last read currently in this active region. - * - * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region - * - * @param read a non-null GATKSAMRecord - */ - @Ensures("reads.size() == old(reads.size()) + 1") - public void add( final GATKSAMRecord read ) { - if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); - - final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); - if ( ! 
readOverlapsRegion(read) ) - throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); - - spanIncludingReads = spanIncludingReads.union( readLoc ); - - if ( ! reads.isEmpty() ) { - final GATKSAMRecord lastRead = reads.get(size() - 1); - if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) - throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); - - if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) - throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); - } - - reads.add( read ); - } - - /** - * Returns true if read would overlap the extended extent of this region - * @param read the read we want to test - * @return true if read can be added to this region, false otherwise - */ - public boolean readOverlapsRegion(final GATKSAMRecord read) { - final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); - return readLoc.overlapsP(extendedLoc); - } - - /** - * Add all reads to this active region - * @param reads a collection of reads to add to this active region - */ - public void addAll(final Collection reads) { - if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); - for ( final GATKSAMRecord read : reads ) - add(read); - } - - /** - * Clear all of the reads currently in this active region - */ - @Ensures("size() == 0") - public void clearReads() { - spanIncludingReads = extendedLoc; - reads.clear(); - } - - /** - * Remove all of the reads in readsToRemove from this active region - * @param readsToRemove the set of reads we want to remove - */ - public void removeAll( final Set readsToRemove ) { - final Iterator it = 
reads.iterator(); - spanIncludingReads = extendedLoc; - while ( it.hasNext() ) { - final GATKSAMRecord read = it.next(); - if ( readsToRemove.contains(read) ) - it.remove(); - else - spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); - } - } - - /** - * Is this region equal to other, excluding any reads in either region in the comparison - * @param other the other active region we want to test - * @return true if this region is equal, excluding any reads and derived values, to other - */ - protected boolean equalExceptReads(final ActiveRegion other) { - if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; - if ( isActive() != other.isActive()) return false; - if ( genomeLocParser != other.genomeLocParser ) return false; - if ( extension != other.extension ) return false; - if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; - return true; - } - - /** - * Does this region represent an active region (all isActiveProbs above threshold) or - * an inactive region (all isActiveProbs below threshold)? 
- */ - public boolean isActive() { - return isActive; - } - - /** - * Intersect this active region with the allowed intervals, returning a list of active regions - * that only contain locations present in intervals - * - * Note that the returned list may be empty, if this active region doesn't overlap the set at all - * - * Note that the resulting regions are all empty, regardless of whether the current active region has reads - * - * @param intervals a non-null set of intervals that are allowed - * @return an ordered list of active region where each interval is contained within intervals - */ - @Ensures("result != null") - protected List splitAndTrimToIntervals(final GenomeLocSortedSet intervals) { - final List allOverlapping = intervals.getOverlapping(getLocation()); - final List clippedRegions = new LinkedList(); - - for ( final GenomeLoc overlapping : allOverlapping ) { - clippedRegions.add(trim(overlapping, extension)); - } - - return clippedRegions; - } - - /** - * Trim this active to just the newExtent, producing a new active region without any reads that has only - * the extent of newExtend intersected with the current extent - * @param newExtent the new extend of the active region we want - * @param newExtension the extension size we want for the newly trimmed active region - * @return a non-null, empty active region - */ - public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { - if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); - - final GenomeLoc subLoc = getLocation().intersect(newExtent); - final int subStart = subLoc.getStart() - getLocation().getStart(); - final int subEnd = subStart + subLoc.size(); - final List subStates = supportingStates.isEmpty() ? 
supportingStates : supportingStates.subList(subStart, subEnd); - return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); - } - - /** - * Trim this active to no more than the newExtent, producing a new active region without any reads that - * attempts to provide the best possible representation of this active region covering the newExtent. - * - * The challenge here is that newExtent may (1) be larger than can be represented by this active region - * + its original extension and (2) the extension must be symmetric on both sides. This algorithm - * therefore determines how best to represent newExtent as a subset of the span of this - * region with a padding value that captures as much of the newExtent as possible. - * - * For example, suppose this active region is - * - * Active: 100-200 with extension of 50, so that the true span is 50-250 - * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 - * - * Here we represent the active region as a active region from 150-200 with 25 bp of padding. 
- * - * The overall constraint is that the active region can never exceed the original active region, and - * the extension is chosen to maximize overlap with the desired region - * - * @param newExtent the new extend of the active region we want - * @return a non-null, empty active region - */ - public ActiveRegion trim(final GenomeLoc newExtent) { - if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); - - final GenomeLoc subActive = getLocation().intersect(newExtent); - final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); - final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); - final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); - - return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); - } - - public void setFinalized(final boolean value) { - hasBeenFinalized = value; - } - - public boolean isFinalized() { - return hasBeenFinalized; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java deleted file mode 100644 index 34705c4c9..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ /dev/null @@ -1,277 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this 
permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.codecs.sampileup; - -import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.util.ParsingUtils; - -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; - -/** - * Decoder for SAM pileup data. For GATK validation purposes only - * - *

- * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute. - * It desribes the base-pair information at each chromosomal position. This format - * facilitates SNP/indel calling and brief alignment viewing by eyes. - *

- *

- * Each line consists of chromosome, 1-based coordinate, reference base, the - * number of reads covering the site, read bases and base qualities. At the - * read base column, a dot stands for a match to the reference base on the - * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch - * on the forward strand and `acgtn' for a mismatch on the reverse strand. - * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between - * this reference position and the next reference position. The length of the - * insertion is given by the integer in the pattern, followed by the inserted sequence. - *

- * - *

- *
See also: @see SAMTools project
- *
See also: @see Pileup format
- *

- * - *

File format example

- *
- *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
- *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
- *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
- *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
- *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
- *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
- *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
- *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
- * 
- * - * @author Matt Hanna - * @since 2009 - */ -public class SAMPileupCodec extends AsciiFeatureCodec { - // the number of tokens we expect to parse from a pileup line - private static final int expectedTokenCount = 10; - private static final char fldDelim = '\t'; - - // allocate once and don't ever bother creating them again: - private static final String baseA = "A"; - private static final String baseC = "C"; - private static final String baseG = "G"; - private static final String baseT = "T"; - private static final String emptyStr = ""; // we will use this for "reference" allele in insertions - - public SAMPileupCodec() { - super(SAMPileupFeature.class); - } - - public SAMPileupFeature decode(String line) { -// 0 1 2 3 4 5 6 7 -//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - String[] tokens = new String[expectedTokenCount]; - - // split the line - int count = ParsingUtils.split(line,tokens,fldDelim); - - // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) - if (count != expectedTokenCount) - throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " + - "(expected = " + expectedTokenCount + ", saw = " + count + " on " + - "line = " + line + ")"); - - SAMPileupFeature feature = new SAMPileupFeature(); - - feature.setChr(tokens[0]); - feature.setStart(Integer.parseInt(tokens[1])); - - if(tokens[2].length() != 1) - throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); - feature.setRef(Character.toUpperCase(tokens[2].charAt(0))); - - String observedString = tokens[3].toUpperCase(); // field 3 - feature.setFWDAlleles(new ArrayList(2)); - - feature.setConsensusConfidence(Double.parseDouble(tokens[4])); - feature.setVariantConfidence(Double.parseDouble(tokens[5])); - - if ( feature.getRef() == '*' ) { - parseIndels(observedString,feature) ; 
- if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); - else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } else { - parseBasesAndQuals(feature,tokens[8],tokens[9]); - // if the variant is a SNP or a reference base (i.e. no variant at all) - if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); - feature.setRefBases(tokens[2].toUpperCase()); - feature.setEnd(feature.getStart()); - - char ch = observedString.charAt(0); - - switch ( ch ) { - case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break; - case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; - case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; - case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; - case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; - case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; - case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; - case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; - case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; - case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; - } - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. 
one ref and one nonref) - feature.setVariantType(VariantType.SNP); - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) - ) feature.setNumNonRef(1); - else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } - } - - return feature; - } - - @Override - public Object readActualHeader(LineIterator lineIterator) { - // No header for this format - return null; - } - - private void parseIndels(String genotype,SAMPileupFeature feature) { - String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit - - // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; - // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases - // from what we have recorded for the deletion allele (e.g. 
"-CAC") - boolean hasRefAllele = false; - - for ( int i = 0 ; i < obs.length ; i++ ) { - if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { - hasRefAllele = true; - feature.getFWDAlleles().add(emptyStr); - continue; - } - - String varBases = obs[i].toUpperCase(); - - switch ( obs[i].charAt(0) ) { - case '+': - if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL); - else feature.setVariantType(VariantType.INSERTION); - feature.setRefBases(emptyStr); - break; - case '-' : - if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL); - else feature.setVariantType(VariantType.DELETION); - feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" - break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); - } - feature.getFWDAlleles().add(varBases); - feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! - } - if ( hasRefAllele ) { - // we got at least one ref. allele (out of two recorded) - if (feature.isReference()) { // both top theories are actually ref allele; - feature.setNumNonRef(0); // no observations of non-reference allele at all - feature.setRefBases(emptyStr); - } else { - feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left - } - } else { - // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we - // fail to set data in a consistent way. - if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1); - else feature.setNumNonRef(2); - } - // DONE with indels - - } - - private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals) - { - //System.out.printf("%s%n%s%n", bases, quals); - - // needs to convert the base string with it's . 
and , to the ref base - StringBuilder baseBuilder = new StringBuilder(); - StringBuilder qualBuilder = new StringBuilder(); - boolean done = false; - for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { - //System.out.printf("%d %d%n", i, j); - char c = (char)bases.charAt(i); - - switch ( c ) { - case '.': // matches reference - case ',': // matches reference - baseBuilder.append(feature.getRef()); - qualBuilder.append(quals.charAt(j++)); - break; - case '$': // end of read - break; - case '*': // end of indel? - j++; - break; - case '^': // mapping quality - i++; - break; - case '+': // start of indel - case '-': // start of indel - final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 - final String rest = bases.substring(i+1); - //System.out.printf("sub is %s%n", rest); - Matcher match = regex.matcher(rest); - if ( ! match.matches() ) { - if ( feature.getRef() != '*' ) - throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); - done = true; - } - else { - String g = match.group(1); - //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); - int l = Integer.parseInt(g); - i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) - //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); - } - break; - default: // non reference base - baseBuilder.append(c); - qualBuilder.append(quals.charAt(j++)); - } - } - - feature.setPileupBases(baseBuilder.toString()); - feature.setPileupQuals(qualBuilder.toString()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java deleted file mode 100644 index a6fd996fd..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java +++ /dev/null @@ -1,272 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is 
hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.codecs.sampileup; - -import net.sf.samtools.util.StringUtil; -import org.broad.tribble.Feature; - -import java.util.List; - -/** - * A tribble feature representing a SAM pileup. - * - * @author mhanna - * @version 0.1 - */ -public class SAMPileupFeature implements Feature { - public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; - - private String contig; // genomic location of this genotyped site - private int start; - private int stop; - - private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) 
- private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions - - private String pileupQuals; // the read base qualities - private String pileupBases; // the read bases themselves - - private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion - private VariantType varType = VariantType.NONE; - private int nNonref = 0; // number of non-reference alleles observed - private int eventLength = 0; // number of inserted or deleted bases - - private double consensusScore = 0; - private double variantScore = 0; - - /** - * create the pileup feature. Default protection so that only other classes in this package can create it. - */ - SAMPileupFeature() {} - - public String getChr() { - return contig; - } - - protected void setChr(String chr) { - this.contig = chr; - } - - public int getStart() { - return start; - } - - protected void setStart(int start) { - this.start = start; - } - - public int getEnd() { - return stop; - } - - protected void setEnd(int end) { - this.stop = end; - } - - public String getQualsAsString() { return pileupQuals; } - - protected void setPileupQuals(String pileupQuals) { - this.pileupQuals = pileupQuals; - } - - /** Returns reference base for point genotypes or '*' for indel genotypes, as a char. - * - */ - public char getRef() { return refBaseChar; } - - protected void setRef(char ref) { - this.refBaseChar = ref; - } - - public int size() { return pileupQuals.length(); } - - /** Returns pile of observed bases over the current genomic location. 
- * - */ - public String getBasesAsString() { return pileupBases; } - - protected void setPileupBases(String pileupBases) { - this.pileupBases = pileupBases; - } - - /** Returns formatted pileup string for the current genomic location as - * "location: reference_base observed_base_pile observed_qual_pile" - */ - public String getPileupString() - { - if(start == stop) - return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString()); - else - return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString()); - } - - /** - * Gets the bases in byte array form. - * @return byte array of the available bases. - */ - public byte[] getBases() { - return StringUtil.stringToBytes(getBasesAsString()); - } - - /** - * Gets the Phred base qualities without ASCII offset. - * @return Phred base qualities. - */ - public byte[] getQuals() { - byte[] quals = StringUtil.stringToBytes(getQualsAsString()); - for(int i = 0; i < quals.length; i++) quals[i] -= 33; - return quals; - } - - /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single - * character (reference base). For indel genotypes, the string is empty for insertions into - * the reference, or consists of deleted bases for deletions. - * - * @return reference allele, forward strand - */ - public String getFWDRefBases() { - return refBases; - } - - protected void setRefBases(String refBases) { - this.refBases = refBases; - } - - public List getFWDAlleles() { - return observedAlleles; - } - - protected void setFWDAlleles(List alleles) { - this.observedAlleles = alleles; - } - - // ---------------------------------------------------------------------- - // - // What kind of variant are we? 
- // - // ---------------------------------------------------------------------- - public boolean isSNP() { return varType == VariantType.SNP; } - public boolean isInsertion() { return varType == VariantType.INSERTION; } - public boolean isDeletion() { return varType == VariantType.DELETION ; } - public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; } - public boolean isReference() { return varType == VariantType.NONE; } - - protected void setVariantType(VariantType variantType) { - this.varType = variantType; - } - - public boolean isHom() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) ); - return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) ); - } - - public boolean isHet() { - // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype - if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) ); - return isIndel() || ( ! 
observedAlleles.get(0).equals(observedAlleles.get(1) ) ); - } - - public double getVariantConfidence() { - return variantScore; - } - - protected void setVariantConfidence(double variantScore) { - this.variantScore = variantScore; - } - - public boolean isBiallelic() { - return nNonref < 2; - } - - protected void setNumNonRef(int nNonref) { - this.nNonref = nNonref; - } - - public double getConsensusConfidence() { - return consensusScore; - } - - protected void setConsensusConfidence(double consensusScore) { - this.consensusScore = consensusScore; - } - - public int length() { - return eventLength; - } - - protected void setLength(int eventLength) { - this.eventLength = eventLength; - } - - public boolean isIndelGenotype() { - return refBaseChar == '*'; - } - - - public boolean isPointGenotype() { - return ! isIndelGenotype(); - } - - /** Implements method required by GenotypeList interface. If this object represents - * an indel genotype, then it returns itself through this method. If this object is a - * point genotype, this method returns null. - * @return - */ - public SAMPileupFeature getIndelGenotype() { - if ( isIndelGenotype() ) return this; - else return null; - } - - /** Implements method required by GenotypeList interface. If this object represents - * a point genotype, then it returns itself through this method. If this object is an - * indel genotype, this method returns null. - * @return - */ - public SAMPileupFeature getPointGenotype() { - if ( isPointGenotype() ) return this; - else return null; - } - - /** Returns true if this object \em is an indel genotype (and thus - * indel genotype is what it only has). - * @return - */ - public boolean hasIndelGenotype() { - return isIndelGenotype(); - } - - /** Returns true if this object \em is a point genotype (and thus - * point genotype is what it only has. 
- * @return - */ - public boolean hasPointGenotype() { - return isPointGenotype(); - } - - - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java deleted file mode 100644 index 4d280423e..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.exceptions; - -import java.lang.reflect.InvocationTargetException; - -/** - * Class for handling common failures of dynamic class resolution - * - * User: depristo - * Date: Sep 3, 2010 - * Time: 2:24:09 PM - */ -public class DynamicClassResolutionException extends UserException { - public DynamicClassResolutionException(Class c, Exception ex) { - super(String.format("Could not create module %s because %s caused by exception %s", - c.getSimpleName(), moreInfo(ex), ex.getMessage())); - } - - private static String moreInfo(Exception ex) { - try { - throw ex; - } catch (InstantiationException e) { - return "BUG: cannot instantiate class: must be concrete class"; - } catch (NoSuchMethodException e) { - return "BUG: Cannot find expected constructor for class"; - } catch (IllegalAccessException e) { - return "Cannot instantiate class (Illegal Access)"; - } catch (InvocationTargetException e) { - return "Cannot instantiate class (Invocation failure)"; - } catch ( Exception e ) { - return String.format("an exception of type %s occurred",e.getClass().getSimpleName()); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java deleted file mode 100644 index 40a730029..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ /dev/null @@ -1,489 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.exceptions; - -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.File; - -/** - * Represents the common user errors detected by Sting / GATK - * - * Root class for all GATK user errors, as well as the container for errors themselves - * - * User: depristo - * Date: Sep 3, 2010 - * Time: 2:24:09 PM - */ -@DocumentedGATKFeature( - groupName = HelpConstants.DOCS_CAT_USRERR, - summary = "Errors caused by incorrect user behavior, such as bad files, bad arguments, etc." ) -public class UserException extends ReviewedStingException { - /** - * The URL where people can get help messages. 
Printed when an error occurs - */ - public static final String PHONE_HOME_DOCS_URL = "http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest"; - - public UserException(String msg) { super(msg); } - public UserException(String msg, Throwable e) { super(msg, e); } - private UserException(Throwable e) { super("", e); } // cannot be called, private access - - protected static String getMessage(Throwable t) { - String message = t.getMessage(); - return message != null ? message : t.getClass().getName(); - } - - public static class CommandLineException extends UserException { - public CommandLineException(String message) { - super(String.format("Invalid command line: %s", message)); - } - } - - public static class MalformedReadFilterException extends CommandLineException { - public MalformedReadFilterException(String message) { - super(String.format("Malformed read filter: %s",message)); - } - } - - public static class IncompatibleReadFiltersException extends CommandLineException { - public IncompatibleReadFiltersException(final String filter1, final String filter2) { - super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); - } - } - - public static class MalformedWalkerArgumentsException extends CommandLineException { - public MalformedWalkerArgumentsException(String message) { - super(String.format("Malformed walker argument: %s",message)); - } - } - - public static class UnsupportedCigarOperatorException extends UserException { - public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord read, final String message) { - super(String.format( - "Unsupported CIGAR operator %s in read %s at %s:%d. 
%s", - co, - read.getReadName(), - read.getReferenceName(), - read.getAlignmentStart(), - message)); - } - } - - - public static class MalformedGenomeLoc extends UserException { - public MalformedGenomeLoc(String message, GenomeLoc loc) { - super(String.format("Badly formed genome loc: %s: %s", message, loc)); - } - - public MalformedGenomeLoc(String message) { - super(String.format("Badly formed genome loc: %s", message)); - } - } - - public static class BadInput extends UserException { - public BadInput(String message) { - super(String.format("Bad input: %s", message)); - } - } - - // todo -- fix up exception cause passing - public static class MissingArgument extends CommandLineException { - public MissingArgument(String arg, String message) { - super(String.format("Argument %s was missing: %s", arg, message)); - } - } - - public static class BadArgumentValue extends CommandLineException { - public BadArgumentValue(String arg, String message) { - super(String.format("Argument %s has a bad value: %s", arg, message)); - } - } - - public static class UnknownTribbleType extends CommandLineException { - public UnknownTribbleType(String type, String message) { - super(String.format("Unknown tribble type %s: %s", type, message)); - } - } - - - public static class BadTmpDir extends UserException { - public BadTmpDir(String message) { - super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); - } - } - - public static class TooManyOpenFiles extends UserException { - public TooManyOpenFiles() { - super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. 
See the unix ulimit command to adjust this limit")); - } - } - - public static class LocalParallelizationProblem extends UserException { - public LocalParallelizationProblem(final File file) { - super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); - } - } - - public static class NotEnoughMemory extends UserException { - public NotEnoughMemory() { - super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); - } - } - - public static class ErrorWritingBamFile extends UserException { - public ErrorWritingBamFile(String message) { - super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); - } - } - - public static class NoSpaceOnDevice extends UserException { - public NoSpaceOnDevice() { - super("There is no space left on the device, so writing failed"); - } - } - - public static class CouldNotReadInputFile extends UserException { - public CouldNotReadInputFile(String message, Exception e) { - super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); - } - - public CouldNotReadInputFile(File file) { - super(String.format("Couldn't read file %s", file.getAbsolutePath())); - } - - public CouldNotReadInputFile(File file, String message) { - super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); - } - - public CouldNotReadInputFile(String file, String message) { - super(String.format("Couldn't read file %s because %s", file, message)); - } - - public CouldNotReadInputFile(File file, String message, Exception e) { - super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); - } - - public CouldNotReadInputFile(File file, Exception e) { - this(file, getMessage(e)); - } - - public CouldNotReadInputFile(String message) { - super(message); - } - } - - - public static class CouldNotCreateOutputFile extends UserException { - public CouldNotCreateOutputFile(File file, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); - } - - public CouldNotCreateOutputFile(File file, String message) { - super(String.format("Couldn't write file %s because %s", file.getAbsolutePath(), message)); - } - - public CouldNotCreateOutputFile(String filename, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); - } - - public CouldNotCreateOutputFile(File file, Exception e) { - super(String.format("Couldn't write file %s 
because exception %s", file.getAbsolutePath(), getMessage(e))); - } - - public CouldNotCreateOutputFile(String message, Exception e) { - super(message, e); - } - } - - public static class MissortedBAM extends UserException { - public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { - super(String.format("Missorted Input SAM/BAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); - } - - public MissortedBAM(SAMFileHeader.SortOrder order, String message) { - super(String.format("Missorted Input SAM/BAM files: files are not sorted in %s order; %s", order, message)); - } - - public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { - super(String.format("Missorted Input SAM/BAM file %s: file sorted in %s order but %s is required; %s", - read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); - } - - public MissortedBAM(String message) { - super(String.format("Missorted Input SAM/BAM files: %s", message)); - } - } - - public static class MalformedBAM extends UserException { - public MalformedBAM(SAMRecord read, String message) { - this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message); - } - - public MalformedBAM(File file, String message) { - this(file.toString(), message); - } - - public MalformedBAM(String source, String message) { - super(String.format("SAM/BAM file %s is malformed: %s", source, message)); - } - } - - public static class MisencodedBAM extends UserException { - public MisencodedBAM(SAMRecord read, String message) { - this(read.getFileSource() != null ? 
read.getFileSource().getReader().toString() : "(none)", message); - } - - public MisencodedBAM(String source, String message) { - super(String.format("SAM/BAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); - } - } - - public static class MalformedVCF extends UserException { - public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); - } - - public MalformedVCF(String message) { - super(String.format("The provided VCF file is malformed: %s", message)); - } - - public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); - } - } - - public static class MalformedBCF2 extends UserException { - public MalformedBCF2( String message ) { - super(String.format("Malformed BCF2 file: %s", message)); - } - } - - public static class MalformedVCFHeader extends UserException { - public MalformedVCFHeader(String message) { - super(String.format("The provided VCF file has a malformed header: %s", message)); - } - } - - public static class ReadMissingReadGroup extends MalformedBAM { - public ReadMissingReadGroup(final SAMRecord read) { - super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); - } - } - - public static class ReadHasUndefinedReadGroup extends MalformedBAM { - public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { - super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); - } - } - - public static class VariantContextMissingRequiredField extends UserException { - public VariantContextMissingRequiredField(String field, VariantContext vc) { - super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); - } - } - - public static class MissortedFile extends UserException { - public MissortedFile(File file, String message, Exception e) { - super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); - } - } - - public static class FailsStrictValidation extends UserException { - public FailsStrictValidation(File f, String message) { - super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message)); - } - } - - public static class MalformedFile extends UserException { - public MalformedFile(String message) { - super(String.format("Unknown file is malformed: %s", message)); - } - - public MalformedFile(String message, Exception e) { - super(String.format("Unknown file is malformed: %s caused by %s", message, getMessage(e))); - } - - public MalformedFile(File f, String message) { - super(String.format("File %s is malformed: %s", f.getAbsolutePath(), message)); - } - - public MalformedFile(File f, String message, Exception e) { - super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, getMessage(e))); - } - - public MalformedFile(String name, String message) { - super(String.format("File associated with name %s is malformed: %s", name, message)); - } - - public MalformedFile(String name, String message, Exception e) { - super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, getMessage(e))); - } - } - - public static class CannotExecuteRScript extends UserException { - 
public CannotExecuteRScript(String message) { - super(String.format("Unable to execute RScript command: " + message)); - } - public CannotExecuteRScript(String message, Exception e) { - super(String.format("Unable to execute RScript command: " + message), e); - } - } - - public static class DeprecatedArgument extends CommandLineException { - public DeprecatedArgument(String param, String doc) { - super(String.format("The parameter %s is deprecated. %s",param,doc)); - } - } - - - public static class IncompatibleSequenceDictionaries extends UserException { - public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { - super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", - name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); - } - } - - public static class LexicographicallySortedSequenceDictionary extends UserException { - public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { - super(String.format("Lexicographically sorted human genome sequence detected in %s." - + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." - + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." 
- + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") - + "\n %s contigs = %s", - name, name, ReadUtils.prettyPrintSequenceRecords(dict))); - } - } - - public static class DeprecatedWalker extends UserException { - public DeprecatedWalker(String walkerName, String version) { - super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); - } - } - - public static class DeprecatedAnnotation extends UserException { - public DeprecatedAnnotation(String annotationName, String version) { - super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); - } - } - - public static class CannotExecuteQScript extends UserException { - public CannotExecuteQScript(String message) { - super(String.format("Unable to execute QScript: " + message)); - } - public CannotExecuteQScript(String message, Exception e) { - super(String.format("Unable to execute QScript: " + message), e); - } - } - - public static class CannotHandleGzippedRef extends UserException { - public CannotHandleGzippedRef() { - super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); - } - } - - public static class MissingReferenceFaiFile extends UserException { - public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { - super(String.format("Fasta index file %s for reference %s does not exist. 
Please see %s for help creating it.", - indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), - HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); - } - } - - public static class MissingReferenceDictFile extends UserException { - public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { - super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.", - dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), - HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); - } - } - - public static class UnreadableKeyException extends UserException { - public UnreadableKeyException ( File f, Exception e ) { - super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + - "Please see %s for help.", - f.getAbsolutePath(), getMessage(e), PHONE_HOME_DOCS_URL)); - } - - public UnreadableKeyException ( String message, Exception e ) { - this(String.format("%s. Error was: %s", message, getMessage(e))); - } - - public UnreadableKeyException ( String message ) { - super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + - "Please see %s for help.", - message, PHONE_HOME_DOCS_URL)); - } - } - - public static class KeySignatureVerificationException extends UserException { - public KeySignatureVerificationException ( File f ) { - super(String.format("The signature in key file %s failed cryptographic verification. " + - "If this key was valid in the past, it's likely been revoked. " + - "Please see %s for help.", - f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); - } - } - - public static class GVCFIndexException extends UserException { - public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) { - super(String.format("GVCF output requires a specific indexing strategy. 
Please re-run including the arguments " + - "-variant_index_type %s -variant_index_parameter %d.", - indexType, indexParameter)); - } - } - - /** - * A special exception that happens only in the case where - * the filesystem, by design or configuration, is completely unable - * to handle locking. This exception will specifically NOT be thrown - * in the case where the filesystem handles locking but is unable to - * acquire a lock due to concurrency. - */ - public static class FileSystemInabilityToLockException extends UserException { - public FileSystemInabilityToLockException( String message ) { - super(message); - } - - public FileSystemInabilityToLockException( String message, Exception innerException ) { - super(message,innerException); - } - } - - public static class IncompatibleRecalibrationTableParameters extends UserException { - public IncompatibleRecalibrationTableParameters(String s) { - super(s); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java deleted file mode 100644 index 0390e32d7..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java +++ /dev/null @@ -1,48 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import java.lang.annotation.*; - -/** - * An annotation to identify a class as a GATK capability for documentation - * - * @author depristo - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface DocumentedGATKFeature { - /** Should we actually document this feature, even through it's annotated? */ - public boolean enable() default true; - /** The overall group name (walkers, readfilters) this feature is associated with */ - public String groupName(); - /** A human readable summary of the purpose of this group of features */ - public String summary() default ""; - /** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? 
*/ - public Class[] extraDocs() default {}; -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java deleted file mode 100644 index 7d6819f39..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java +++ /dev/null @@ -1,59 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -/** - * Documentation unit. Effectively a class version of the DocumentedGATKFeature. - * Immutable data structure. - * - * @author depristo - */ -class DocumentedGATKFeatureObject { - /** Which class are we documenting. Specific to each class being documented */ - private final Class classToDoc; - /** Are we enabled? 
*/ - private final boolean enable; - private final String groupName, summary; - private final Class[] extraDocs; - - public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs) { - this.classToDoc = classToDoc; - this.enable = enable; - this.groupName = groupName; - this.summary = summary; - this.extraDocs = extraDocs; - } - - public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary) { - this(classToDoc, true, groupName, summary, new Class[]{}); - } - - public Class getClassToDoc() { return classToDoc; } - public boolean enable() { return enable; } - public String groupName() { return groupName; } - public String summary() { return summary; } - public Class[] extraDocs() { return extraDocs; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java deleted file mode 100644 index 63cb0900a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java +++ /dev/null @@ -1,519 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.RootDoc; -import freemarker.template.Configuration; -import freemarker.template.DefaultObjectWrapper; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broad.tribble.FeatureCodec; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.*; -import java.util.*; - -/** - * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker - * templates to produce html formatted GATKDocs for walkers - * and other classes. - *

- * This document has the following workflow: - *

- * 1 -- walk the javadoc hierarchy, looking for class that have the - * DocumentedGATKFeature annotation or are in the type hierarchy in the - * static list of things to document, and are to be documented - * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete - * set of things to document - * 3 -- for each unit, actually generate an html page documenting it - * as well as links to related features via their units. Writing - * of a specific class HTML is accomplished by a generate DocumentationHandler - * 4 -- write out an index of all units, organized by group - *

- * The documented classes are restricted to only those with @DocumentedGATKFeature - * annotation or are in the STATIC_DOCS class. - */ -public class GATKDoclet { - final protected static Logger logger = Logger.getLogger(GATKDoclet.class); - - /** - * Where we find the help FreeMarker templates - */ - final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); - - /** - * Where we write the GATKDoc html directory - */ - final protected static File DESTINATION_DIR = new File("gatkdocs"); - - final private static String FORUM_KEY_FILE = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; - // ---------------------------------------------------------------------- - // - // Global variables that are set on the command line by javadoc - // - // ---------------------------------------------------------------------- - protected static String buildTimestamp = null, absoluteVersion = null; - protected static boolean showHiddenFeatures = false; - - protected static boolean testOnly = false; - - /** - * Any class that's in this list will be included in the documentation - * when the -test argument is provided. Useful for debugging. - */ - private static final List> testOnlyKeepers = Arrays.asList( - DocumentationTest.class, CommandLineGATK.class, UserException.class); - - /** - * The javadoc root doc - */ - RootDoc rootDoc; - - /** - * The set of all things we are going to document - */ - Set myWorkUnits; - - /** - * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends - * one of the DocumentedGATKFeatureObjects.clazz of this collection will also - * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. Useful - * when you want to document things that implement an interface (annotations on java - * interfaces aren't inherited) or whose base class isn't under your control (tribble - * codecs). 
- */ - final static Collection STATIC_DOCS = new ArrayList(); - - static { - STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, - HelpConstants.DOCS_CAT_RODCODECS, - "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED")); - } - - - /** - * Extracts the contents of certain types of javadoc and adds them to an XML file. - * - * @param rootDoc The documentation root. - * @return Whether the JavaDoc run succeeded. - * @throws java.io.IOException if output can't be written. - */ - public static boolean start(RootDoc rootDoc) throws IOException { - logger.setLevel(Level.INFO); - - // load arguments - for (String[] options : rootDoc.options()) { - if (options[0].equals("-build-timestamp")) - buildTimestamp = options[1]; - if (options[0].equals("-absolute-version")) - absoluteVersion = options[1]; - if (options[0].equals("-include -hidden")) - showHiddenFeatures = true; - if (options[0].equals("-test")) - testOnly = true; - } - - // process the docs - new GATKDoclet().processDocs(rootDoc); - - - return true; - } - - /** - * Validate the given options against options supported by this doclet. - * - * @param option Option to validate. - * @return Number of potential parameters; 0 if not supported. - */ - public static int optionLength(String option) { - if (option.equals("-build-timestamp") || - option.equals("-absolute-version") || - option.equals("-include-hidden")) { - return 2; - } else if (option.equals("-test")) - return 1; - else - return 0; - } - - /** - * Are we supposed to include @Hidden annotations in our documented output? 
- * - * @return - */ - public boolean showHiddenFeatures() { - return showHiddenFeatures; - } - - /** - * @param rootDoc - */ - private void processDocs(RootDoc rootDoc) { - // setup the global access to the root - this.rootDoc = rootDoc; - - try { - // basic setup - DESTINATION_DIR.mkdirs(); - FileUtils.copyFile(new File(SETTINGS_DIR + "/bootstrap.min.css"), new File(DESTINATION_DIR + "/bootstrap.min.css")); - FileUtils.copyFile(new File(SETTINGS_DIR + "/bootstrap.min.js"), new File(DESTINATION_DIR + "/bootstrap.min.js")); - FileUtils.copyFile(new File(SETTINGS_DIR + "/jquery.min.js"), new File(DESTINATION_DIR + "/jquery.min.js")); - // print the Version number - FileUtils.writeByteArrayToFile(new File(DESTINATION_DIR + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); - - /* ------------------------------------------------------------------- */ - /* You should do this ONLY ONCE in the whole application life-cycle: */ - - Configuration cfg = new Configuration(); - // Specify the data source where the template files come from. - cfg.setDirectoryForTemplateLoading(SETTINGS_DIR); - // Specify how templates will see the data-model. This is an advanced topic... 
- cfg.setObjectWrapper(new DefaultObjectWrapper()); - - myWorkUnits = computeWorkUnits(); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : myWorkUnits) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - for (GATKDocWorkUnit workUnit : myWorkUnits) { - processDocWorkUnit(cfg, workUnit, groups, data); - } - - processIndex(cfg, new ArrayList(myWorkUnits)); - - File forumKeyFile = new File(FORUM_KEY_FILE); - if (forumKeyFile.exists()) { - String forumKey = null; - // Read in a one-line file so we can do a for loop - for (String line : new XReadLines(forumKeyFile)) - forumKey = line; - updateForum(myWorkUnits, forumKey); - } - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private void updateForum(Set docWorkUnits, String forumKey) { - //first get list of posts that need to be added - List old = ForumAPIUtils.getPostedTools(forumKey); - - for (String s : old) - System.out.println(s); - - System.out.printf("Forum has %d items%n", old.size()); - System.out.printf("Docs have %d items%n", docWorkUnits.size()); - - List toAdd = new ArrayList(); - for (GATKDocWorkUnit tool : docWorkUnits) { - if (!old.contains(tool.name)) { - System.out.println("WILL POST: " + tool.name + " TO FORUM"); - toAdd.add(tool); - } - } - - //update using list - for (GATKDocWorkUnit tool : toAdd) { - //if ( tool.name.equals("ApplyRecalibration") ) - ForumAPIUtils.postToForum(tool, forumKey); - } - } - - /** - * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
- * - * @return - */ - private Set computeWorkUnits() { - TreeSet m = new TreeSet(); - - for (ClassDoc doc : rootDoc.classes()) { - //logger.debug("Considering " + doc); - Class clazz = getClassForClassDoc(doc); - - // don't add anything that's not DocumentationTest if we are in test mode - if (clazz != null && testOnly && !testOnlyKeepers.contains(clazz)) - continue; - - //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) - // logger.debug("foo"); - - DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); - DocumentedGATKFeatureHandler handler = createHandler(doc, feature); - if (handler != null && handler.includeInDocs(doc)) { - //logger.info("Generating documentation for class " + doc); - String filename = handler.getDestinationFilename(doc, clazz); - GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), - filename, feature.groupName(), feature, handler, doc, clazz, - buildTimestamp, absoluteVersion); - m.add(unit); - } - } - - return m; - } - - /** - * Create a handler capable of documenting the class doc according to feature. Returns - * null if no appropriate handler is found or doc shouldn't be documented at all. - * - * @param doc - * @param feature - * @return - */ - private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { - if (feature != null) { - if (feature.enable()) { - DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); - handler.setDoclet(this); - return handler; - } else { - logger.info("Skipping disabled Documentation for " + doc); - } - } - - return null; - } - - /** - * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc - * structure we will apply to Doc. 
- * - * @param doc - * @return null if this proves inappropriate or doc shouldn't be documented - */ - private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { - Class docClass = getClassForClassDoc(doc); - - if (docClass == null) - return null; // not annotated so it shouldn't be documented - - if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { - DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); - return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs()); - } else { - for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { - if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { - return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs()); - } - } - return null; - } - } - - /** - * Return the Java class described by the ClassDoc doc - * - * @param doc - * @return - */ - private Class getClassForClassDoc(ClassDoc doc) { - try { - // todo -- what do I need the ? extends Object to pass the compiler? - return (Class) DocletUtils.getClassForDoc(doc); - } catch (ClassNotFoundException e) { - //logger.warn("Couldn't find class for ClassDoc " + doc); - // we got a classdoc for a class we can't find. 
Maybe in a library or something - return null; - } catch (NoClassDefFoundError e) { - return null; - } catch (UnsatisfiedLinkError e) { - return null; // naughty BWA bindings - } - } - - /** - * Create the html index listing all of the GATKDocs features - * - * @param cfg - * @param indexData - * @throws IOException - */ - private void processIndex(Configuration cfg, List indexData) throws IOException { - /* Get or create a template */ - Template temp = cfg.getTemplate("generic.index.template.html"); - - /* Merge data-model with template */ - Writer out = new OutputStreamWriter(new FileOutputStream(new File(DESTINATION_DIR + "/index.html"))); - try { - temp.process(groupIndexData(indexData), out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedStingException("Failed to create GATK documentation", e); - } - } - - /** - * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, - * create the high-level grouping data listing individual features by group. - * - * @param indexData - * @return - */ - private Map groupIndexData(List indexData) { - // - // root -> data -> { summary -> y, filename -> z }, etc - // -> groups -> group1, group2, etc. 
- Map root = new HashMap(); - - Collections.sort(indexData); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : indexData) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - //System.out.printf(groups.toString()); - - root.put("data", data); - root.put("groups", groups); - root.put("timestamp", buildTimestamp); - root.put("version", absoluteVersion); - - return root; - } - - /** - * Trivial helper routine that returns the map of name and summary given the annotation - * AND adds a super-category so that we can custom-order the categories in the index - * - * @param annotation - * @return - */ - private static final Map toMap(DocumentedGATKFeatureObject annotation) { - Map root = new HashMap(); - root.put("id", annotation.groupName().replaceAll("\\W", "")); - root.put("name", annotation.groupName()); - root.put("summary", annotation.summary()); - - /** - * Add-on super-category definitions. The assignments depend on parsing the names - * defined in HelpConstants.java so be careful of changing anything. - * Also, the super-category value strings need to be the same as used in the - * Freemarker template. This is all fairly clunky but the best I could do without - * making major changes to the DocumentedGATKFeatureObject. Doesn't help that - * Freemarker makes any scripting horribly awkward. 
- */ - final String supercatValue; - if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; - else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; - else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; - else supercatValue = "other"; - - root.put("supercat", supercatValue); - - return root; - } - - /** - * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units - * - * @param c the class we are looking for - * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found - */ - public final GATKDocWorkUnit findWorkUnitForClass(Class c) { - for (final GATKDocWorkUnit unit : this.myWorkUnits) - if (unit.clazz.equals(c)) - return unit; - return null; - } - - /** - * Return the ClassDoc associated with clazz - * - * @param clazz - * @return - */ - public ClassDoc getClassDocForClass(Class clazz) { - return rootDoc.classNamed(clazz.getName()); - } - - /** - * High-level function that processes a single DocWorkUnit unit using its handler - * - * @param cfg - * @param unit - * @param data - * @throws IOException - */ - private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) - throws IOException { - //System.out.printf("Processing documentation for class %s%n", unit.classDoc); - - unit.handler.processOne(unit); - unit.forTemplate.put("groups", groups); - unit.forTemplate.put("data", data); - // Get or create a template - Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); - - // Merge data-model with template - File outputPath = new File(DESTINATION_DIR + "/" + unit.filename); - try { - Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); - temp.process(unit.forTemplate, out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedStingException("Failed to create GATK documentation", e); - } - } - - private static String 
getSimpleVersion(String absoluteVersion) { - String[] parts = absoluteVersion.split("-"); - - // by skipping i=0, there is no trailing separator - for (int i = 1; i < 2; i++) { - parts[0] = parts[0].concat("-"); - parts[0] = parts[0].concat(parts[i]); - } - - return parts[0]; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java deleted file mode 100644 index 893a8349b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ /dev/null @@ -1,920 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.help; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.FieldDoc; -import com.sun.javadoc.Tag; -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.IOException; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * - */ -public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { - private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); - - /** - * The max. length of the longest of --fullName -shortName argument name - * before we prefer the shorter option. 
- */ - private static final int MAX_DISPLAY_NAME = 30; - - /** - * The Class we are documenting - */ - private GATKDocWorkUnit toProcess; - - @Override - public boolean includeInDocs(ClassDoc doc) { - try { - Class type = DocletUtils.getClassForDoc(doc); - boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class); - return !hidden && JVMUtils.isConcrete(type); - } catch (ClassNotFoundException e) { - return false; - } - } - - - @Override - public String getTemplateName(ClassDoc doc) throws IOException { - return "generic.template.html"; - } - - @Override - public void processOne(GATKDocWorkUnit toProcessArg) { - this.toProcess = toProcessArg; - - //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); - Map root = new HashMap(); - - addHighLevelBindings(root); - addArgumentBindings(root); - addRelatedBindings(root); - root.put("group", toProcess.group); - - // Adding in retrieval of peripheral info (rf annotations etc) - getClazzAnnotations(toProcess.clazz, root); - - toProcess.setHandlerContent((String) root.get("summary"), root); - } - - /** - * Add high-level summary information about toProcess to root, such as its - * name, summary, description, version, etc. - * - * @param root - */ - protected void addHighLevelBindings(Map root) { - root.put("name", toProcess.classDoc.name()); - - // Extract overrides from the doc tags. 
- StringBuilder summaryBuilder = new StringBuilder(); - for (Tag tag : toProcess.classDoc.firstSentenceTags()) - summaryBuilder.append(tag.text()); - root.put("summary", summaryBuilder.toString()); - root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); - root.put("timestamp", toProcess.buildTimestamp); - root.put("version", toProcess.absoluteVersion); - - for (Tag tag : toProcess.classDoc.tags()) { - root.put(tag.name(), tag.text()); - } - } - - /** - * Add bindings describing related GATK capabilites to toProcess - * - * @param root - */ - protected void addRelatedBindings(Map root) { - List> extraDocsData = new ArrayList>(); - - // add in all of the explicitly related items - for (final Class extraDocClass : toProcess.annotation.extraDocs()) { - final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); - if (otherUnit == null) - throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); - extraDocsData.add( - new HashMap() {{ - put("filename", otherUnit.filename); - put("name", otherUnit.name); - }}); - } - root.put("extradocs", extraDocsData); - } - - /** - * Add information about all of the arguments available to toProcess to root - * - * @param root - */ - protected void addArgumentBindings(Map root) { - ParsingEngine parsingEngine = createStandardGATKParsingEngine(); - - Map>> args = createArgumentMap(); - root.put("arguments", args); - try { - // loop over all of the arguments according to the parsing engine - for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) { - // todo -- why can you have multiple ones? 
- ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); - FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); - Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); - if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) { - final String kind = docKindOfArg(argumentSource); - - final Object value = argumentValue(toProcess.clazz, argumentSource); - if (value != null) - argBindings.put("defaultValue", prettyPrintValueString(value)); - - args.get(kind).add(argBindings); - args.get("all").add(argBindings); - } - } - - // sort the arguments - for (Map.Entry>> entry : args.entrySet()) { - entry.setValue(sortArguments(entry.getValue())); - } - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } - - /** - * Return the argument kind (required, advanced, hidden, etc) of this argumentSource - * - * @param argumentSource - * @return - */ - @Requires("argumentSource != null") - @Ensures("result != null") - private String docKindOfArg(ArgumentSource argumentSource) { - if (argumentSource.isRequired()) { - if (argumentSource.isInput()) return "required_in"; - else if (argumentSource.isOutput()) return "required_out"; - else if (argumentSource.isFlag()) return "required_flag"; - else return "required_param"; - } - else if (argumentSource.isAdvanced()) { - if (argumentSource.isInput()) return "advanced_in"; - else if (argumentSource.isOutput()) return "advanced_out"; - else if (argumentSource.isFlag()) return "advanced_flag"; - else return "advanced_param"; - } - else if (argumentSource.isHidden()) return "hidden"; - else if (argumentSource.isDeprecated()) return "deprecated"; - else { - if (argumentSource.isInput()) return "optional_in"; - else if (argumentSource.isOutput()) return "optional_out"; - else if (argumentSource.isFlag()) return "optional_flag"; - else return "optional_param"; - } - } - - /** - * Attempts to determine the value of argumentSource in an 
instantiated version of c - * - * @param c - * @param argumentSource - * @return value of argumentSource, or null if this isn't possible - */ - @Requires({"c != null", "argumentSource != null"}) - private Object argumentValue(Class c, ArgumentSource argumentSource) { - // get the value of the field - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(toProcess.clazz); - if (instance != null) { - final Object value = getFieldValue(instance, argumentSource.field.getName()); - if (value != null) - return value; - - if (argumentSource.createsTypeDefault()) { - try { // handle the case where there's an implicit default - return argumentSource.typeDefaultDocString(); - } catch (ReviewedStingException e) { - ; // failed to create type default, don't worry about it - } - } - } - - return null; - } - - /** - * Create the argument map for holding class arguments - * - * @return - */ - private Map>> createArgumentMap() { - Map>> args = new HashMap>>(); - args.put("all", new ArrayList>()); - args.put("required_in", new ArrayList>()); - args.put("required_out", new ArrayList>()); - args.put("required_param", new ArrayList>()); - args.put("required_flag", new ArrayList>()); - args.put("optional_in", new ArrayList>()); - args.put("optional_out", new ArrayList>()); - args.put("optional_param", new ArrayList>()); - args.put("optional_flag", new ArrayList>()); - args.put("advanced_in", new ArrayList>()); - args.put("advanced_out", new ArrayList>()); - args.put("advanced_param", new ArrayList>()); - args.put("advanced_flag", new ArrayList>()); - args.put("hidden", new ArrayList>()); - args.put("deprecated", new ArrayList>()); - return args; - } - - - /** - * Sorts the individual argument list in unsorted according to CompareArgumentsByName - * - * @param unsorted - * @return - */ - private List> sortArguments(List> unsorted) { - Collections.sort(unsorted, new CompareArgumentsByName()); - return unsorted; - } - - /** - * Sort arguments by 
case-insensitive comparison ignoring the -- and - prefixes - */ - private class CompareArgumentsByName implements Comparator> { - public int compare(Map x, Map y) { - return elt(x).compareTo(elt(y)); - } - - private String elt(Map m) { - String v = m.get("name").toString().toLowerCase(); - if (v.startsWith("--")) - return v.substring(2); - else if (v.startsWith("-")) - return v.substring(1); - else - throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); - } - } - - /** - * Umbrella function that groups the collection of values for specific annotations applied to an - * instance of class c. Lists of collected values are added directly to the "toProcess" object. - * Requires being able to instantiate the class. - * - * @param classToProcess the object to instantiate and query for the annotation - * @param root the root of the document handler, to which we'll store collected annotations - */ - private void getClazzAnnotations(Class classToProcess, Map root) { - // - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(classToProcess); - if (instance != null) { - final Class myClass = instance.getClass(); - // Get parallelism options - final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); - root.put("parallel", parallelOptions); - // Get annotation info (what type of annotation, standard etc.) 
- final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); - root.put("annotinfo", StringUtils.join(annotInfo, ", ")); - // Get annotation field (whether it goes in INFO or FORMAT) - root.put("annotfield", getAnnotField(myClass)); - // Get walker type if applicable - root.put("walkertype", getWalkerType(myClass)); - // Get partition type if applicable - root.put("partitiontype", getPartitionType(myClass)); - // Get read filter annotations (ReadFilters) if applicable - final HashSet> bucket= getReadFilters(myClass, new HashSet>()); - root.put("readfilters", bucket); - // Get default downsampling settings - final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); - root.put("downsampling", dsSettings); - // Get reference window size settings - final HashMap refwindow = getRefWindow(myClass, new HashMap()); - root.put("refwindow", refwindow); - // Get ActiveRegion size settings - final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); - root.put("activeregion", activeRegion); - // anything else? - } else { - // put empty items to avoid blowups - root.put("parallel", new HashSet()); - root.put("annotinfo", ""); - root.put("annotfield", ""); - root.put("walkertype", ""); - root.put("partitiontype", ""); - root.put("readfilters", new HashSet>()); - root.put("downsampling", new HashMap()); - root.put("refwindow", new HashMap()); - root.put("activeregion", new HashMap()); - } - } - - /** - * Utility function that checks which parallelism options are available for an instance of class c. 
- * - * @param myClass the class to query for the interfaces - * @param parallelOptions an empty HashSet in which to collect the info - * @return a hash set of parallelism options, otherwise an empty set - */ - private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - final HashMap nugget = new HashMap(); - if (intfClass.getSimpleName().equals("TreeReducible")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); - } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); - } else { - continue; - } - parallelOptions.add(nugget); - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return parallelOptions; - } - return getParallelism(mySuperClass, parallelOptions); - } - - /** - * Utility function that looks up whether the annotation goes in INFO or FORMAT field. 
- * - * @param myClass the class to query for the interfaces - * @return a String specifying the annotation field - */ - private final String getAnnotField(Class myClass) { - // - // Look up superclasses recursively until we find either - // GenotypeAnnotation or InfoFieldAnnotation - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass == InfoFieldAnnotation.class) { - return "INFO (variant-level)"; - } else if (mySuperClass == GenotypeAnnotation.class) { - return "FORMAT (sample genotype-level)"; - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getAnnotField(mySuperClass); - } - - /** - * Utility function that determines the annotation type for an instance of class c. - * - * @param myClass the class to query for the interfaces - * @param annotInfo an empty HashSet in which to collect the info - * @return a hash set of the annotation types, otherwise an empty set - */ - private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - if (intfClass.getName().contains("Annotation")) { - annotInfo.add(intfClass.getSimpleName()); - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return annotInfo; - } - return getAnnotInfo(mySuperClass, annotInfo); - } - - /** - * Utility function that determines the default downsampling settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param dsSettings an empty HashMap in which to collect the info - * @return a hash set of the downsampling settings, otherwise an empty set - */ - private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Downsample.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); - if(thisAnnotation instanceof Downsample) { - final Downsample dsAnnotation = (Downsample) thisAnnotation; - dsSettings.put("by", dsAnnotation.by().toString()); - dsSettings.put("to_cov", dsAnnotation.toCoverage()); - } - } - return dsSettings; - } - - /** - * Utility function that determines the reference window size for an instance of class c. - * - * @param myClass the class to query for the settings - * @param refWindow an empty HashMap in which to collect the info - * @return a HashMap of the window start and stop, otherwise an empty HashMap - */ - private HashMap getRefWindow(Class myClass, HashMap refWindow) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Reference.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); - if(thisAnnotation instanceof Reference) { - final Reference refAnnotation = (Reference) thisAnnotation; - refWindow.put("start", refAnnotation.window().start()); - refWindow.put("stop", refAnnotation.window().stop()); - } - } - return refWindow; - } - - /** - * Utility function that determines the ActiveRegion settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param activeRegion an empty HashMap in which to collect the info - * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap - */ - private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); - if(thisAnnotation instanceof ActiveRegionTraversalParameters) { - final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; - activeRegion.put("ext", arAnnotation.extension()); - activeRegion.put("max", arAnnotation.maxRegion()); - activeRegion.put("min", arAnnotation.minRegion()); - } - } - return activeRegion; - } - - /** - * Utility function that determines the partition type of an instance of class c. - * - * @param myClass the class to query for the annotation - * @return the partition type if applicable, otherwise an empty string - */ - private String getPartitionType(Class myClass) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(PartitionBy.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); - if(thisAnnotation instanceof PartitionBy) { - final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; - return partAnnotation.value().toString(); - } - } - return ""; - } - - /** - * Utility function that determines the type of walker subclassed by an instance of class c. 
- * - * @param myClass the class to query for the annotation - * @return the type of walker if applicable, otherwise an empty string - */ - private String getWalkerType(Class myClass) { - // - // Look up superclasses recursively until we find either Walker or Object - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Walker")) { - return myClass.getSimpleName(); - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getWalkerType(mySuperClass); - } - - /** - * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. - * - * @param myClass the class to query for the annotation - * @param bucket a container in which we store the annotations collected - * @return a hash set of values, otherwise an empty set - */ - private HashSet> getReadFilters(Class myClass, HashSet> bucket) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ReadFilters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); - if(thisAnnotation instanceof ReadFilters) { - final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; - for (Class filter : rfAnnotation.value()) { - // make hashmap of simplename and url - final HashMap nugget = new HashMap(); - nugget.put("name", filter.getSimpleName()); - nugget.put("filename", GATKDocUtils.htmlFilenameForClass(filter)); - bucket.add(nugget); - } - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return bucket; - } - return getReadFilters(mySuperClass, bucket); - } - - - /** - * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in - * instance of class c. 
- * - * @param instance the object to query for the field value - * @param fieldName the name of the field we are looking for in instance - * @return The value assigned to field in the ArgumentCollection, otherwise null - */ - private Object getFieldValue(Object instance, String fieldName) { - // - // subtle note. If you have a field named X that is an ArgumentCollection that - // contains a field X as well, you need only consider fields in the argumentCollection, not - // matching the argument itself. - // - // @ArgumentCollection - // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - // - - for (Field field : JVMUtils.getAllFields(instance.getClass())) { - if (field.isAnnotationPresent(ArgumentCollection.class)) { - //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); - Object fieldValue = JVMUtils.getFieldValue(field, instance); - Object value = getFieldValue(fieldValue, fieldName); - if (value != null) - return value; - } else if (field.getName().equals(fieldName)) { - return JVMUtils.getFieldValue(field, instance); - } - } - - return null; - } - - /** - * Pretty prints value - *

- * Assumes value != null - * - * @param value - * @return - */ - private Object prettyPrintValueString(Object value) { - if (value.getClass().isArray()) { - Class type = value.getClass().getComponentType(); - if (boolean.class.isAssignableFrom(type)) - return Arrays.toString((boolean[]) value); - if (byte.class.isAssignableFrom(type)) - return Arrays.toString((byte[]) value); - if (char.class.isAssignableFrom(type)) - return Arrays.toString((char[]) value); - if (double.class.isAssignableFrom(type)) - return Arrays.toString((double[]) value); - if (float.class.isAssignableFrom(type)) - return Arrays.toString((float[]) value); - if (int.class.isAssignableFrom(type)) - return Arrays.toString((int[]) value); - if (long.class.isAssignableFrom(type)) - return Arrays.toString((long[]) value); - if (short.class.isAssignableFrom(type)) - return Arrays.toString((short[]) value); - if (Object.class.isAssignableFrom(type)) - return Arrays.toString((Object[]) value); - else - throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); - } else if (RodBinding.class.isAssignableFrom(value.getClass())) { - // annoying special case to handle the UnBound() constructor - return "none"; - } else if (value instanceof String) { - return value.equals("") ? "\"\"" : value; - } else { - return value.toString(); - } - } - - /** - * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
- * - * @param c - * @return - */ - private Object makeInstanceIfPossible(Class c) { - Object instance = null; - try { - // don't try to make something where we will obviously fail - if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() && - !c.isArray() && !c.isPrimitive() & JVMUtils.isConcrete(c)) { - instance = c.newInstance(); - //System.out.printf("Created object of class %s => %s%n", c, instance); - return instance; - } else - return null; - } catch (IllegalAccessException e) { - } catch (InstantiationException e) { - } catch (ExceptionInInitializerError e) { - } catch (SecurityException e) { - } - // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions - // and rethrow then as RuntimeExceptions - catch (RuntimeException e) { - } - - return instance; - } - - - /** - * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet - * - * @return - */ - private ParsingEngine createStandardGATKParsingEngine() { - CommandLineProgram clp = new CommandLineGATK(); - try { - CommandLineProgram.start(clp, new String[]{}, true); - return clp.parser; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Gets the javadocs associated with field name in classDoc. Throws a - * runtime exception if this proves impossible. 
- * - * @param classDoc - * @param name - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { - return getFieldDoc(classDoc, name, true); - } - - /** - * Recursive helper routine to getFieldDoc() - * - * @param classDoc - * @param name - * @param primary - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { - //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); - for (FieldDoc fieldDoc : classDoc.fields(false)) { - //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); - if (fieldDoc.name().equals(name)) - return fieldDoc; - - Field field = DocletUtils.getFieldForFieldDoc(fieldDoc); - if (field == null) - throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); - if (field.isAnnotationPresent(ArgumentCollection.class)) { - ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); - if (typeDoc == null) - throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); - else { - FieldDoc result = getFieldDoc(typeDoc, name, false); - if (result != null) - return result; - // else keep searching - } - } - } - - // if we didn't find it here, wander up to the superclass to find the field - if (classDoc.superclass() != null) { - return getFieldDoc(classDoc.superclass(), name, false); - } - - if (primary) - throw new RuntimeException("No field found for expected field " + name); - else - return null; - } - - /** - * Returns a Pair of (main, synonym) names for argument with fullName s1 and - * shortName s2. The main is selected to be the longest of the two, provided - * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. 
- * - * @param s1 the short argument name without -, or null if not provided - * @param s2 the long argument name without --, or null if not provided - * @return A pair of fully qualified names (with - or --) for the argument. The first - * element is the primary display name while the second (potentially null) is a - * synonymous name. - */ - Pair displayNames(String s1, String s2) { - s1 = s1 == null ? null : "-" + s1; - s2 = s2 == null ? null : "--" + s2; - - if (s1 == null) return new Pair(s2, null); - if (s2 == null) return new Pair(s1, null); - - String l = s1.length() > s2.length() ? s1 : s2; - String s = s1.length() > s2.length() ? s2 : s1; - - if (l.length() > MAX_DISPLAY_NAME) - return new Pair(s, l); - else - return new Pair(l, s); - } - - /** - * Returns a human readable string that describes the Type type of a GATK argument. - *

- * This will include parameterized types, so that Set{T} shows up as Set(T) and not - * just Set in the docs. - * - * @param type - * @return - */ - protected String argumentTypeString(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType) type; - List subs = new ArrayList(); - for (Type actualType : parameterizedType.getActualTypeArguments()) - subs.add(argumentTypeString(actualType)); - return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]"; - } else if (type instanceof GenericArrayType) { - return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]"; - } else if (type instanceof WildcardType) { - throw new RuntimeException("We don't support wildcards in arguments: " + type); - } else if (type instanceof Class) { - return ((Class) type).getSimpleName(); - } else { - throw new StingException("Unknown type: " + type); - } - } - - /** - * Helper routine that returns the Feature.class required by a RodBinding, - * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if - * the Type doesn't fit either model. - * - * @param type - * @return - */ - protected Class getFeatureTypeIfPossible(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType paramType = (ParameterizedType) type; - if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) { - return (Class) JVMUtils.getParameterizedTypeClass(type); - } else { - for (Type paramtype : paramType.getActualTypeArguments()) { - Class x = getFeatureTypeIfPossible(paramtype); - if (x != null) - return x; - } - } - } - - return null; - } - - /** - * High-level entry point for creating a FreeMarker map describing the GATK argument - * source with definition def, with associated javadoc fieldDoc. 
- * - * @param fieldDoc - * @param source - * @param def - * @return a non-null Map binding argument keys with their values - */ - protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { - Map root = new HashMap(); - Pair names = displayNames(def.shortName, def.fullName); - - root.put("name", names.getFirst()); - - if (names.getSecond() != null) - root.put("synonyms", names.getSecond()); - - root.put("required", def.required ? "yes" : "no"); - - // type of the field - root.put("type", argumentTypeString(source.field.getGenericType())); - - Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); - if (featureClass != null) { - // deal with the allowable types - FeatureManager manager = new FeatureManager(); - List rodTypes = new ArrayList(); - for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) { - rodTypes.add(String.format("%s", - GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), - descriptor.getName())); - } - - root.put("rodTypes", Utils.join(", ", rodTypes)); - } - - // summary and fulltext - root.put("summary", def.doc != null ? def.doc : ""); - root.put("fulltext", fieldDoc.commentText()); - - // What are our enum options? - if (def.validOptions != null) - root.put("options", docForEnumArgument(source.field.getType())); - - // general attributes - List attributes = new ArrayList(); - if (def.required) attributes.add("required"); - if (source.isDeprecated()) attributes.add("deprecated"); - if (attributes.size() > 0) - root.put("attributes", Utils.join(", ", attributes)); - - return root; - } - - /** - * Helper routine that provides a FreeMarker map for an enumClass, grabbing the - * values of the enum and their associated javadoc documentation. 
- * - * @param enumClass - * @return - */ - @Requires("enumClass.isEnum()") - private List> docForEnumArgument(final Class enumClass) { - final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); - if ( doc == null ) - throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead"); - - final Set enumConstantFieldNames = enumConstantsNames(enumClass); - - final List> bindings = new ArrayList>(); - for (final FieldDoc fieldDoc : doc.fields(false)) { - if (enumConstantFieldNames.contains(fieldDoc.name()) ) - bindings.add( - new HashMap() {{ - put("name", fieldDoc.name()); - put("summary", fieldDoc.commentText()); - }}); - } - - return bindings; - } - - /** - * Returns the name of the fields that are enum constants according to reflection - * - * @return a non-null set of fields that are enum constants - */ - private Set enumConstantsNames(final Class enumClass) { - final Set enumConstantFieldNames = new HashSet(); - - for ( final Field field : enumClass.getFields() ) { - if ( field.isEnumConstant() ) - enumConstantFieldNames.add(field.getName()); - } - - return enumConstantFieldNames; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java deleted file mode 100644 index 2ed35d848..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java +++ /dev/null @@ -1,64 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright 
notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -public class HelpConstants { - - public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; - public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; - public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; - - /** - * Arguments for parallelism options - */ - public final static String ARG_TREEREDUCIBLE = "-nt"; - public final static String ARG_NANOSCHEDULABLE = "-nct"; - public final static String CMDLINE_GATK_URL = GATK_DOCS_URL + "org_broadinstitute_sting_gatk_CommandLineGATK.html"; - - /** - * Definition of the group names / categories of tools. 
- * The names get parsed to make supercategories in the doc index, - * so be careful when making big changes -- see GATKDoclet.java toMap() - */ - public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools"; - public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools"; - public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)"; - public final static String DOCS_CAT_RF = "Read Filters"; - public final static String DOCS_CAT_REFUTILS = "Reference Utilities"; - public final static String DOCS_CAT_RODCODECS = "ROD Codecs"; - public final static String DOCS_CAT_USRERR = "User Exceptions"; - public final static String DOCS_CAT_VALIDATION = "Validation Utilities"; - public final static String DOCS_CAT_ANNOT = "Variant Annotations"; - public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; - public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; - public final static String DOCS_CAT_TEST = "Testing Tools"; - public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; - - public static String forumPost(String post) { - return GATK_FORUM_URL + post; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java deleted file mode 100644 index d700bff28..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java +++ /dev/null @@ -1,317 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to 
do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.help; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; -/** - * Print out help for Sting command-line applications. - */ - -public class HelpFormatter { - /** our log, which we want to capture anything from org.broadinstitute.sting */ - private static Logger logger = Logger.getLogger(HelpFormatter.class); - - public static final int FIELD_SEPARATION_WIDTH = 3; - - /** - * Prints the help, given a collection of argument definitions. - * @param applicationDetails Application details - * @param argumentDefinitions Argument definitions for which help should be printed. 
- */ - public void printHelp( ApplicationDetails applicationDetails, ArgumentDefinitions argumentDefinitions ) { - List argumentGroups = prepareArgumentGroups( argumentDefinitions ); - - List header = applicationDetails.applicationHeader; - String barrier = createBarrier(header); - - System.out.printf("%s%n",barrier); - for(String headerLine: header) - System.out.printf("%s%n",headerLine); - System.out.printf("%s%n",barrier); - for(String attributionLine: applicationDetails.attribution) - System.out.printf("%s%n",attributionLine); - System.out.printf("%s%n",barrier); - - String synopsis = getSynopsis(applicationDetails.runningInstructions,argumentGroups); - String additionalDetails = applicationDetails.additionalHelp != null ? applicationDetails.additionalHelp : ""; - String detailedDescription = getDetailed(argumentGroups); - - System.out.printf("%s%n%s%n%s%n",synopsis,detailedDescription,additionalDetails ); - } - - /** - * Gets the synopsis: the actual command to run. - * @param runningInstructions Instructions on how to run hte application. - * @param argumentGroups Program arguments sorted in order of definition group displays. - * @return A synopsis line. - */ - private String getSynopsis( String runningInstructions, - List argumentGroups ) { - // Build out the synopsis all as one long line. 
- StringBuilder lineBuilder = new StringBuilder(); - Formatter lineFormatter = new Formatter( lineBuilder ); - - lineFormatter.format("java %s", runningInstructions); - - for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) { - for( ArgumentDefinition argumentDefinition: argumentGroup.argumentDefinitions ) { - if(argumentDefinition.isHidden) - continue; - lineFormatter.format(" "); - if( !argumentDefinition.required ) lineFormatter.format("["); - if( argumentDefinition.shortName != null ) - lineFormatter.format("-%s", argumentDefinition.shortName); - else - lineFormatter.format("--%s", argumentDefinition.fullName); - if( !argumentDefinition.isFlag ) - lineFormatter.format(" <%s>", argumentDefinition.fullName); - if( !argumentDefinition.required ) lineFormatter.format("]"); - } - } - - // Word wrap the synopsis. - List wrappedSynopsis = TextFormattingUtils.wordWrap( lineBuilder.toString(), TextFormattingUtils.DEFAULT_LINE_WIDTH ); - - String header = "usage: "; - int headerLength = header.length(); - - StringBuilder synopsisBuilder = new StringBuilder(); - Formatter synopsisFormatter = new Formatter(synopsisBuilder); - for( String synopsisLine: wrappedSynopsis ) { - synopsisFormatter.format("%" + headerLength + "s%s%n", header, synopsisLine); - header = ""; - } - - return synopsisBuilder.toString(); - } - - /** - * Gets detailed output about each argument type. - * @param argumentGroups Collection of program arguments sorted according to how they should be shown. - * @return Detailed text about all arguments. - */ - private String getDetailed( List argumentGroups ) { - StringBuilder builder = new StringBuilder(); - - for( ArgumentDefinitionGroup argumentGroup: argumentGroups ) - builder.append( getDetailForGroup( argumentGroup ) ); - - return builder.toString(); - } - - /** - * Gets a detailed description for a given argument group. - * @param argumentDefinitionGroup The group of argument definitions to render. 
- * @return A string giving detailed info about the contents of this group. - */ - private String getDetailForGroup( ArgumentDefinitionGroup argumentDefinitionGroup ) { - if(argumentDefinitionGroup.allHidden()) - return ""; - - StringBuilder builder = new StringBuilder(); - Formatter formatter = new Formatter( builder ); - - if( argumentDefinitionGroup.groupName != null && argumentDefinitionGroup.argumentDefinitions.size() != 0 ) - builder.append( String.format("%nArguments for %s:%n", argumentDefinitionGroup.groupName ) ); - - List argumentDefinitions = new ArrayList(); - for(ArgumentDefinition argumentDefinition: argumentDefinitionGroup.argumentDefinitions) { - if(!argumentDefinition.isHidden) - argumentDefinitions.add(argumentDefinition); - } - - // Try to fit the entire argument definition across the screen, but impose an arbitrary cap of 3/4 * - // LINE_WIDTH in case the length of the arguments gets out of control. - int argWidth = Math.min( findLongestArgumentCallingInfo(argumentDefinitions), (TextFormattingUtils.DEFAULT_LINE_WIDTH*3)/4 - FIELD_SEPARATION_WIDTH ); - int docWidth = TextFormattingUtils.DEFAULT_LINE_WIDTH - argWidth - FIELD_SEPARATION_WIDTH; - - for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { - Iterator wordWrappedArgs = TextFormattingUtils.wordWrap( getArgumentCallingInfo(argumentDefinition), argWidth ).iterator(); - Iterator wordWrappedDoc = TextFormattingUtils.wordWrap( getArgumentDoc(argumentDefinition), docWidth ).iterator(); - - while( wordWrappedArgs.hasNext() || wordWrappedDoc.hasNext() ) { - String arg = wordWrappedArgs.hasNext() ? wordWrappedArgs.next() : ""; - String doc = wordWrappedDoc.hasNext() ? wordWrappedDoc.next() : ""; - - String formatString = "%-" + argWidth + "s%" + FIELD_SEPARATION_WIDTH + "s%s%n"; - formatter.format( formatString, arg, "", doc ); - } - } - - return builder.toString(); - } - - /** - * Gets a string indicating how this argument should be passed to the application. 
- * @param argumentDefinition Argument definition for which help should be printed. - * @return Calling information for this argument. - */ - private String getArgumentCallingInfo( ArgumentDefinition argumentDefinition ) { - StringBuilder builder = new StringBuilder(); - Formatter formatter = new Formatter( builder ); - - formatter.format(" "); - if( argumentDefinition.shortName != null ) - formatter.format("-%s,", argumentDefinition.shortName); - formatter.format("--%s", argumentDefinition.fullName); - if( !argumentDefinition.isFlag ) - formatter.format(" <%s>", argumentDefinition.fullName); - - return builder.toString(); - } - - /** - * Gets a string of argument documentation. - * @param argumentDefinition Argument definition for which help should be printed. - * @return Brief description for this argument. - */ - private String getArgumentDoc( ArgumentDefinition argumentDefinition ) { - StringBuilder builder = new StringBuilder(); - builder.append(argumentDefinition.doc); - if( argumentDefinition.validOptions != null ) { - builder.append(" ("); - builder.append(Utils.join("|",argumentDefinition.validOptions)); - builder.append(")"); - } - return builder.toString(); - } - - /** - * Crude implementation which finds the longest argument portion - * given a set of arguments. - * @param argumentDefinitions argument definitions to inspect. - * @return longest argument length. - */ - private int findLongestArgumentCallingInfo( Collection argumentDefinitions ) { - int longest = 0; - for( ArgumentDefinition argumentDefinition: argumentDefinitions ) { - String argumentText = getArgumentCallingInfo( argumentDefinition ); - if( longest < argumentText.length() ) - longest = argumentText.length(); - } - return longest; - } - - /** - * Extract the argument definition groups from the argument definitions and arrange them appropriately. - * For help, we want the arguments sorted as they are declared in the class. 
However, required arguments - * should appear before optional arguments. - * @param argumentDefinitions Argument definitions from which to extract argument groups. - * @return A list of argument groups sorted in display order. - */ - private List prepareArgumentGroups( ArgumentDefinitions argumentDefinitions ) { - // Sort the list of argument definitions according to how they should be shown. - // Put the sorted results into a new cloned data structure. - Comparator definitionComparator = new Comparator() { - public int compare( ArgumentDefinition lhs, ArgumentDefinition rhs ) { - if( lhs.required && rhs.required ) return 0; - if( lhs.required ) return -1; - if( rhs.required ) return 1; - return 0; - } - }; - - List argumentGroups = new ArrayList(); - for( ArgumentDefinitionGroup argumentGroup: argumentDefinitions.getArgumentDefinitionGroups() ) { - List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); - Collections.sort( sortedDefinitions, definitionComparator ); - argumentGroups.add( new ArgumentDefinitionGroup(argumentGroup.groupName,sortedDefinitions) ); - } - - // Sort the argument groups themselves with main arguments first, followed by plugins sorted in name order. - Comparator groupComparator = new Comparator() { - public int compare( ArgumentDefinitionGroup lhs, ArgumentDefinitionGroup rhs ) { - if( lhs.groupName == null && rhs.groupName == null ) return 0; - if( lhs.groupName == null ) return -1; - if( rhs.groupName == null ) return 1; - return lhs.groupName.compareTo(rhs.groupName); - } - }; - Collections.sort( argumentGroups, groupComparator ); - - - return argumentGroups; - } - - /** - * generateHeaderInformation - *

- *

- * Generate a standard header for the logger - * - * @param applicationDetails details of the application to run. - * @param parsedArgs the arguments passed in - */ - public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map parsedArgs) { - - DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); - java.util.Date date = new java.util.Date(); - - String barrier = createBarrier(applicationDetails.applicationHeader); - - logger.info(barrier); - for (String headerLine : applicationDetails.applicationHeader) - logger.info(headerLine); - logger.debug("Current directory: " + System.getProperty("user.dir")); - for (Map.Entry entry: parsedArgs.entrySet()) { - ArgumentMatchSource matchSource = entry.getKey(); - final String sourceName; - switch (matchSource.getType()) { - case CommandLine: sourceName = "Program"; break; - case Provider: sourceName = matchSource.getDescription(); break; - default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); - } - - String output = sourceName + " Args: " + entry.getValue().getDescription(); - logger.info(output); - } - logger.info("Date/Time: " + dateFormat.format(date)); - logger.info(barrier); - - for(String attribution: applicationDetails.attribution) - logger.info(attribution); - logger.info(barrier); - } - - /** - * Create a barrier to use to distinguish the header from the rest of the output. - * @param text A collection of lines to output as part of a header. - * @return A barrier consisting of the '-' character. 
- */ - private static String createBarrier(List text) { - int barrierWidth = 0; - for(String headerLine: text) - barrierWidth = Math.max(headerLine.length(),barrierWidth); - return String.format("%0" + barrierWidth + "d",0).replace('0','-'); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java deleted file mode 100644 index b83a15d6d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.Arrays; - -import static java.lang.Math.log10; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * - * User: rpoplin, carneiro - * Date: 3/1/12 - */ -public class Log10PairHMM extends N2MemoryPairHMM { - /** - * Should we use exact log10 calculation (true), or an approximation (false)? - */ - private final boolean doExactLog10; - - protected static final int matchToMatch = 0; - protected static final int indelToMatch = 1; - protected static final int matchToInsertion = 2; - protected static final int insertionToInsertion = 3; - protected static final int matchToDeletion = 4; - protected static final int deletionToDeletion = 5; - - // we divide e by 3 because the observed base could have come from any of the non-observed alleles - protected final static double log10_3 = log10(3.0); - - /** - * Create an uninitialized PairHMM - * - * @param doExactLog10 should the log10 calculations be exact (slow) or approximate (faster) - */ - public Log10PairHMM(final boolean doExactLog10) { - this.doExactLog10 = doExactLog10; - } - - /** - * Is this HMM using exact log10 calculations? 
- * @return true if exact, false if approximate - */ - public boolean isDoingExactLog10Calculations() { - return doExactLog10; - } - - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - for( int iii=0; iii < paddedMaxReadLength; iii++ ) { - Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); - } - } - - /** - * {@inheritDoc} - */ - @Override - public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex) { - - - if ( ! constantsAreInitialized || recacheReadValues ) - initializeProbabilities(insertionGOP, deletionGOP, overallGCP); - initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { - // set the initial value (free deletions in the beginning) for the first row in the deletion matrix - initializeMatrixValues(haplotypeBases); - } - - for (int i = 1; i < paddedReadLength; i++) { - // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based - for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { - updateCell(i, j, prior[i][j], transition[i]); - } - } - - // final probability is the log10 sum of the last element in the Match and Insertion state arrays - // this way we ignore all paths that ended in deletions! (huge) - // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. 
- double finalSumProbabilities = finalLikelihoodCalculation(); - - return finalSumProbabilities; - } - - protected void initializeMatrixValues(final byte[] haplotypeBases) { - final double initialValue = Math.log10(1.0 / haplotypeBases.length); - for( int j = 0; j < paddedHaplotypeLength; j++ ) { - deletionMatrix[0][j] = initialValue; - } - } - - protected double finalLikelihoodCalculation() { - final int endI = paddedReadLength - 1; - double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); - for (int j = 2; j < paddedHaplotypeLength; j++) - finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); - return finalSumProbabilities; - } - - - /** - * Initializes the matrix that holds all the constants related to the editing - * distance between the read and the haplotype. - * - * @param haplotypeBases the bases of the haplotype - * @param readBases the bases of the read - * @param readQuals the base quality scores of the read - * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) - */ - public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { - - // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases - // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. - - for (int i = 0; i < readBases.length; i++) { - final byte x = readBases[i]; - final byte qual = readQuals[i]; - for (int j = startIndex; j < haplotypeBases.length; j++) { - final byte y = haplotypeBases[j]; - prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 
0.0 : log10_3)) ); - } - } - } - - /** - * Initializes the matrix that holds all the constants related to quality scores. - * - * @param insertionGOP insertion quality scores of the read - * @param deletionGOP deletion quality scores of the read - * @param overallGCP overall gap continuation penalty - */ - @Requires({ - "insertionGOP != null", - "deletionGOP != null", - "overallGCP != null" - }) - @Ensures("constantsAreInitialized") - protected void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - for (int i = 0; i < insertionGOP.length; i++) { - final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); - transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]); - transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); - transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); - transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); - transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); - } - - // note that we initialized the constants - constantsAreInitialized = true; - } - - - /** - * Compute the log10SumLog10 of the values - * - * NOTE NOTE NOTE - * - * Log10PairHMM depends critically on this function tolerating values that are all -Infinity - * and the sum returning -Infinity. Note good. Needs to be fixed. - * - * NOTE NOTE NOTE - * - * @param values an array of log10 probabilities that need to be summed - * @return the log10 of the sum of the probabilities - */ - @Requires("values != null") - protected double myLog10SumLog10(final double[] values) { - return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); - } - - /** - * Updates a cell in the HMM matrix - * - * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the - * initial conditions - - * @param indI row index in the matrices to update - * @param indJ column index in the matrices to update - * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transition an array with the six transition relevant to this location - */ - protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - - matchMatrix[indI][indJ] = prior + - myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], - insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], - deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); - insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); - deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java deleted file mode 100644 index 18cb9054b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or 
sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Requires; - -/** - * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix - * - * User: rpoplin - * Date: 10/16/12 - */ -abstract class N2MemoryPairHMM extends PairHMM { - protected double[][] transition = null; // The transition probabilities cache - protected double[][] prior = null; // The prior probabilities cache - protected double[][] matchMatrix = null; - protected double[][] insertionMatrix = null; - protected double[][] deletionMatrix = null; - - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - - public void doNotUseTristateCorrection() { - doNotUseTristateCorrection = true; - } - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * - * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
- * - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } - - /** - * Print out the core hmm matrices for debugging - */ - protected void dumpMatrices() { - dumpMatrix("matchMetricArray", matchMatrix); - dumpMatrix("insertionMatrix", insertionMatrix); - dumpMatrix("deletionMatrix", deletionMatrix); - } - - /** - * Print out in a human readable form the matrix for debugging - * @param name the name of this matrix - * @param matrix the matrix of values - */ - @Requires({"name != null", "matrix != null"}) - private void dumpMatrix(final String name, final double[][] matrix) { - System.out.printf("%s%n", name); - for ( int i = 0; i < matrix.length; i++) { - System.out.printf("\t%s[%d]", name, i); - for ( int j = 0; j < matrix[i].length; j++ ) { - if ( Double.isInfinite(matrix[i][j]) ) - System.out.printf(" %15s", String.format("%f", matrix[i][j])); - else - System.out.printf(" % 15.5e", matrix[i][j]); - } - System.out.println(); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java deleted file mode 100644 index 254945af4..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ /dev/null @@ -1,327 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* 
obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.pairhmm; - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
- * - * User: rpoplin - * Date: 10/16/12 - */ -public abstract class PairHMM { - protected final static Logger logger = Logger.getLogger(PairHMM.class); - - protected boolean constantsAreInitialized = false; - - protected byte[] previousHaplotypeBases; - protected int hapStartIndex; - - public enum HMM_IMPLEMENTATION { - /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ - EXACT, - /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ - ORIGINAL, - /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ - LOGLESS_CACHING, - /* Optimized AVX implementation of LOGLESS_CACHING called through JNI */ - VECTOR_LOGLESS_CACHING, - /* Debugging for vector implementation of LOGLESS_CACHING */ - DEBUG_VECTOR_LOGLESS_CACHING, - /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ - ARRAY_LOGLESS - } - - protected int maxHaplotypeLength, maxReadLength; - protected int paddedMaxReadLength, paddedMaxHaplotypeLength; - protected int paddedReadLength, paddedHaplotypeLength; - protected boolean initialized = false; - - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } - - //debug array - protected double[] mLikelihoodArray; - - //profiling information - protected static final boolean doProfiling = true; - protected long computeTime = 0; - protected long startTime = 0; - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * - * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
- * - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { - if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); - if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); - - maxHaplotypeLength = haplotypeMaxLength; - maxReadLength = readMaxLength; - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - paddedMaxReadLength = readMaxLength + 1; - paddedMaxHaplotypeLength = haplotypeMaxLength + 1; - - previousHaplotypeBases = null; - - constantsAreInitialized = false; - initialized = true; - } - - /** - * Called at the end of PairHMM for a region - mostly used by the JNI implementations - */ - public void finalizeRegion() - { - ; - } - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * This function is used by the JNI implementations to transfer all data once to the native code - * @param haplotypes the list of haplotypes - * @param perSampleReadList map from sample name to list of reads - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { - initialize(readMaxLength, haplotypeMaxLength); - } - - protected int findMaxReadLength(final List reads) { - int listMaxReadLength = 0; - for(GATKSAMRecord read : reads){ - final int readLength = read.getReadLength(); - if( readLength > 
listMaxReadLength ) { listMaxReadLength = readLength; } - } - return listMaxReadLength; - } - - protected int findMaxHaplotypeLength(final Map haplotypeMap) { - int listMaxHaplotypeLength = 0; - for( final Allele a: haplotypeMap.keySet() ) { - final Haplotype h = haplotypeMap.get(a); - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } - } - return listMaxHaplotypeLength; - } - - /** - * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from - * each haplotype given base substitution, insertion, and deletion probabilities. - * - * @param reads the list of reads - * @param alleleHaplotypeMap the list of haplotypes - * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. - * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of - * said read coming from the said haplotype under the provided error model - */ - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { - if(doProfiling) - startTime = System.nanoTime(); - - // (re)initialize the pairHMM only if necessary - final int readMaxLength = findMaxReadLength(reads); - final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); - if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - mLikelihoodArray = new double[reads.size()*alleleHaplotypeMap.size()]; - int idx = 0; - for(GATKSAMRecord read : reads){ - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final byte[] readInsQuals = 
read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); - - // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - byte[] currentHaplotypeBases = null; - boolean isFirstHaplotype = true; - Allele currentAllele = null; - double log10l; - //for (final Allele allele : alleleHaplotypeMap.keySet()){ - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()){ - //final Haplotype haplotype = alleleHaplotypeMap.get(allele); - final Allele allele = currEntry.getKey(); - final Haplotype haplotype = currEntry.getValue(); - final byte[] nextHaplotypeBases = haplotype.getBases(); - if (currentHaplotypeBases != null) { - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - mLikelihoodArray[idx++] = log10l; - likelihoodMap.add(read, currentAllele, log10l); - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = allele; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - - // there is no next haplotype, so pass null for nextHaplotypeBases. - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); - likelihoodMap.add(read, currentAllele, log10l); - mLikelihoodArray[idx++] = log10l; - } - } - if(doProfiling) - computeTime += (System.nanoTime() - startTime); - return likelihoodMap; - } - - /** - * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion - * probabilities. - * - * Note on using hapStartIndex. 
This allows you to compute the exact true likelihood of a full haplotypes - * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, - * starting only at the place where the new haplotype bases and the previous haplotype bases different. This - * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. - * Note that this assumes that the read and all associated quals values are the same. - * - * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length - * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length - * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases - * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases - * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases - * @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases - * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated - * parameters are the same, and only the haplotype bases are changing underneath us - * @return the log10 probability of read coming from the haplotype under the provided error model - */ - protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final boolean recacheReadValues, - final byte[] nextHaploytpeBases) { - - if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); - if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); - if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); - if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); - if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); - if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); - if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); - if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); - if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); - - paddedReadLength = readBases.length + 1; - paddedHaplotypeLength = haplotypeBases.length + 1; - - hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; - - // Pre-compute the difference between the current haplotype and the next one to be run - // Looking ahead is necessary for the ArrayLoglessPairHMM implementation - final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 
0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); - - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); - - if ( ! MathUtils.goodLog10Probability(result) ) - throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); - - // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). - // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. - previousHaplotypeBases = haplotypeBases; - - // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype - // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart - hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; - - return result; - } - - /** - * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 - */ - @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", - "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) - protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex); - - /** - * Compute the first position at which two haplotypes differ - * - * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. - * - * @param haplotype1 the first haplotype1 - * @param haplotype2 the second haplotype1 - * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same - */ - public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { - if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); - if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); - - for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { - if( haplotype1[iii] != haplotype2[iii] ) { - return iii; - } - } - - return Math.min(haplotype1.length, haplotype2.length); - } - - /** - * Return the results of the computeLikelihoods function - */ - public double[] getLikelihoodArray() { return mLikelihoodArray; } - /** - * Called at the end of the program to close files, print profiling information etc - */ - public void close() - { - 
if(doProfiling) - System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java deleted file mode 100644 index c36d7f888..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ /dev/null @@ -1,2120 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.variant; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.io.Serializable; -import java.util.*; - -public class GATKVariantContextUtils { - - private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); - - public static final int DEFAULT_PLOIDY = 2; - public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - - public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - - public final static String MERGE_FILTER_PREFIX = "filterIn"; - public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; - public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; - public final static String MERGE_INTERSECTION = "Intersection"; - - public enum GenotypeMergeType { - /** - * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. - */ - UNIQUIFY, - /** - * Take genotypes in priority order (see the priority argument). - */ - PRIORITIZE, - /** - * Take the genotypes in any order. - */ - UNSORTED, - /** - * Require that all samples/genotypes be unique between all inputs. 
- */ - REQUIRE_UNIQUE - } - - public enum FilteredRecordMergeType { - /** - * Union - leaves the record if any record is unfiltered. - */ - KEEP_IF_ANY_UNFILTERED, - /** - * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. - */ - KEEP_IF_ALL_UNFILTERED, - /** - * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. - */ - KEEP_UNCONDITIONAL - } - - public enum MultipleAllelesMergeType { - /** - * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. - */ - BY_TYPE, - /** - * Merge all allele types at the same start position into the same VCF record. - */ - MIX_TYPES - } - - /** - * Refactored out of the AverageAltAlleleLength annotation class - * @param vc the variant context - * @return the average length of the alt allele (a double) - */ - public static double getMeanAltAlleleLength(VariantContext vc) { - double averageLength = 1.0; - if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( final Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - averageLength = ( (double) averageLengthNum )/averageLengthDenom; - } - - return averageLength; - } - - /** - * create a genome location, given a variant context - * @param genomeLocParser parser - * @param vc the variant context - * @return the genomeLoc - */ - public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); - } - - public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { - if (!context.isSNP() || !context.isBiallelic()) - throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); - return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); - } - - /** - * If this is a BiAllelic SNP, is it a transition? - */ - public static boolean isTransition(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - /** - * If this is a BiAllelic SNP, is it a transversion? 
- */ - public static boolean isTransversion(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - public static boolean isTransition(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - public static boolean isTransversion(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - /** - * Returns a context identical to this with the REF and ALT alleles reverse complemented. - * - * @param vc variant context - * @return new vc - */ - public static VariantContext reverseComplement(VariantContext vc) { - // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); - for ( final Allele originalAllele : vc.getAlleles() ) { - Allele newAllele; - if ( originalAllele.isNoCall() ) - newAllele = originalAllele; - else - newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); - alleleMap.put(originalAllele, newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList<>(); - for ( final Allele allele : genotype.getAlleles() ) { - Allele newAllele = alleleMap.get(allele); - if ( newAllele == null ) - newAllele = Allele.NO_CALL; - newAlleles.add(newAllele); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } - - /** - * Returns true iff VC is an non-complex indel where every allele represents an expansion or - * contraction of a series of identical bases in the reference. 
- * - * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT - * - * If VC = -/CT, then this function returns true because the CT insertion matches exactly the - * upcoming reference. - * If VC = -/CTA then this function returns false because the CTA isn't a perfect match - * - * Now consider deletions: - * - * If VC = CT/- then again the same logic applies and this returns true - * The case of CTA/- makes no sense because it doesn't actually match the reference bases. - * - * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For - * each insertion allele of n bases, check if that allele matches the next n reference bases. - * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, - * as it must necessarily match the first n bases. If this test returns true for all - * alleles you are a tandem repeat, otherwise you are not. - * - * @param vc - * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return false; - - final Allele ref = vc.getReference(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) - return false; - } - - // we've passed all of the tests, so we are a repeat - return true; - } - - /** - * - * @param vc - * @param refBasesStartingAtVCWithPad - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final boolean VERBOSE = false; - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return null; - - final Allele refAllele = vc.getReference(); - final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); - - byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList<>(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); - - final int[] repetitionCount = result.first; - // repetition count = 0 means allele is not a tandem expansion of context - if (repetitionCount[0] == 0 || repetitionCount[1] == 0) - return null; - - if (lengths.size() == 0) { - lengths.add(repetitionCount[0]); // add ref allele length only once - } - lengths.add(repetitionCount[1]); // add this alt allele's length - - repeatUnit = result.second; - if (VERBOSE) { - System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); - System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); - System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); - System.out.println("RU:"+new String(repeatUnit)); - } - } - - return new Pair, byte[]>(lengths,repeatUnit); - } - - public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { - /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. - Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. - */ - - byte[] longB; - // find first repeat unit based on either ref or alt, whichever is longer - if (altBases.length > refBases.length) - longB = altBases; - else - longB = refBases; - - // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units - // for example, -*,CACA needs to first be decomposed into (CA)2 - final int repeatUnitLength = findRepeatedSubstring(longB); - final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); - - final int[] repetitionCount = new int[2]; - // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) - int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); - repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; - repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - - return new Pair<>(repetitionCount, repeatUnit); - - } - - /** - * Find out if a string can be represented as a tandem number of substrings. - * For example ACTACT is a 2-tandem of ACT, - * but ACTACA is not. 
- * - * @param bases String to be tested - * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't - * be represented as one, it will be just the length of the input string) - */ - public static int findRepeatedSubstring(byte[] bases) { - - int repLength; - for (repLength=1; repLength <=bases.length; repLength++) { - final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); - boolean allBasesMatch = true; - for (int start = repLength; start < bases.length; start += repLength ) { - // check that remaining of string is exactly equal to repeat unit - final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); - if (!Arrays.equals(candidateRepeatUnit, basePiece)) { - allBasesMatch = false; - break; - } - } - if (allBasesMatch) - return repLength; - } - - return repLength; - } - - /** - * Helper routine that finds number of repetitions a string consists of. - * For example, for string ATAT and repeat unit AT, number of repetitions = 2 - * @param repeatUnit Substring - * @param testString String to test - * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) - * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's - */ - public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { - int numRepeats = 0; - if (lookForward) { - // look forward on the test string - for (int start = 0; start < testString.length; start += repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 - // look forward on the test string - for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - /** - * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference - * @param ref - * @param alt - * @param refBasesStartingAtVCWithoutPad - * @return - */ - protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { - if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) - return false; // we require one allele be a prefix of another - - if ( ref.length() > alt.length() ) { // we are a deletion - return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); - } else { // we are an insertion - return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); - } - } - - protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { - final String potentialRepeat = l.substring(s.length()); // skip s bases - - for ( int i = 0; i < minNumberOfMatches; i++) { - final int start = i * potentialRepeat.length(); - final int end = (i+1) * potentialRepeat.length(); - if ( ref.length() < end ) - return false; // we ran out of bases to test - final String refSub = ref.substring(start, end); - if ( ! 
refSub.equals(potentialRepeat) ) - return false; // repeat didn't match, fail - } - - return true; // we passed all tests, we matched - } - - public enum GenotypeAssignmentMethod { - /** - * set all of the genotype GT values to NO_CALL - */ - SET_TO_NO_CALL, - - /** - * Use the subsetted PLs to greedily assigned genotypes - */ - USE_PLS_TO_ASSIGN, - - /** - * Try to match the original GT calls, if at all possible - * - * Suppose I have 3 alleles: A/B/C and the following samples: - * - * original_GT best_match to A/B best_match to A/C - * S1 => A/A A/A A/A - * S2 => A/B A/B A/A - * S3 => B/B B/B A/A - * S4 => B/C A/B A/C - * S5 => C/C A/A C/C - * - * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes - * when split into 2 bi-allelic variants will be het in each, which is good in some cases, - * rather than the undetermined behavior when using the PLs to assign, which could result - * in hom-var or hom-ref for each, depending on the exact PL values. - */ - BEST_MATCH_TO_ORIGINAL, - - /** - * do not even bother changing the GTs - */ - DO_NOT_ASSIGN_GENOTYPES - } - - /** - * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) - * - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); - if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); - - // optimization: if no input genotypes, just exit - if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); - } - - /** - * Figure out which likelihood indexes to use for a selected down set of alleles - * - * @param originalVC the original VariantContext - * @param allelesToUse the subset of alleles to use - * @return a list of PL indexes to use or null if none - */ - private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) - return null; - - return getLikelihoodIndexes(originalVC, alleleIndexesToUse); - } - - /** - * Get the actual likelihoods indexes to use given the corresponding allele indexes - * - * @param originalVC the original VariantContext - * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) - * @return a non-null List - */ - private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { - - final List result = new ArrayList<>(30); - - // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); - - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) - result.add(PLindex); - } - - return result; - } - - /** - * Given an original VariantContext and a list of alleles from that VC to keep, - * returns a bitset representing which allele indexes should be kept - * - * @param originalVC the original VC - * @param allelesToKeep the list of alleles to keep - * @return non-null bitset - */ - private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { - final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; - final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; - - // the reference Allele is definitely still used - alleleIndexesToKeep[0] = true; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) - alleleIndexesToKeep[i+1] = true; - } - - return alleleIndexesToKeep; - } - - /** - * Create the new GenotypesContext with the subsetted PLs - * - * @param originalGs the original GenotypesContext - * @param vc the original VariantContext - * @param allelesToUse the actual alleles to use with the new Genotypes - * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, - final VariantContext vc, - final List 
allelesToUse, - final List likelihoodIndexesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // make sure we are seeing the expected number of likelihoods per sample - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - final GenotypeBuilder gb = new GenotypeBuilder(g); - - // create the new likelihoods array from the alleles we are allowed to use - double[] newLikelihoods; - if ( !g.hasLikelihoods() ) { - // we don't have any likelihoods, so we null out PLs and make G ./. - newLikelihoods = null; - gb.noPL(); - } else { - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { - logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); - newLikelihoods = null; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( final int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) - gb.noPL(); - else - gb.PL(newLikelihoods); - } - - updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); - newGTs.add(gb.make()); - } - - return newGTs; - } - - private static boolean 
likelihoodsAreUninformative(final double[] likelihoods) { - return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; - } - - /** - * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod - * - * @param originalGT the original genotype calls, cannot be null - * @param gb the builder where we should put our newly called alleles, cannot be null - * @param assignmentMethod the method to use to do the assignment, cannot be null - * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null - * @param allelesToUse the alleles we are using for our subsetting - */ - public static void updateGenotypeAfterSubsetting(final List originalGT, - final GenotypeBuilder gb, - final GenotypeAssignmentMethod assignmentMethod, - final double[] newLikelihoods, - final List allelesToUse) { - switch ( assignmentMethod ) { - case DO_NOT_ASSIGN_GENOTYPES: - break; - case SET_TO_NO_CALL: - gb.alleles(NO_CALL_ALLELES); - gb.noAD(); - gb.noGQ(); - break; - case USE_PLS_TO_ASSIGN: - gb.noAD(); - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { - // if there is no mass on the (new) likelihoods, then just no-call the sample - gb.alleles(NO_CALL_ALLELES); - gb.noGQ(); - } else { - // find the genotype with maximum likelihoods - final int PLindex = MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); - gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - break; - case BEST_MATCH_TO_ORIGINAL: - final List best = new LinkedList<>(); - final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument - for ( final Allele originalAllele : originalGT ) { - 
best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); - } - gb.noGQ(); - gb.noPL(); - gb.noAD(); - gb.alleles(best); - break; - } - } - - /** - * Subset the samples in VC to reference only information with ref call alleles - * - * Preserves DP if present - * - * @param vc the variant context to subset down to - * @param ploidy ploidy to use if a genotype doesn't have any alleles - * @return a GenotypesContext - */ - public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // optimization: if no input genotypes, just exit - if (oldGTs.isEmpty()) return oldGTs; - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); - - final Allele ref = vc.getReference(); - final List diploidRefAlleles = Arrays.asList(ref, ref); - - // create the new genotypes - for ( final Genotype g : vc.getGenotypes() ) { - final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); - final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); - final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); - if ( g.hasDP() ) gb.DP(g.getDP()); - if ( g.hasGQ() ) gb.GQ(g.getGQ()); - newGTs.add(gb.make()); - } - - return newGTs; - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param vc variant context with genotype likelihoods - * @return genotypes context - */ - public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc) { - return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. 
- * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { - if ( ! vc.isVariant() || vc.isBiallelic() ) - // non variant or biallelics already satisfy the contract - return Collections.singletonList(vc); - else { - final List biallelics = new LinkedList<>(); - - for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - final List alleles = Arrays.asList(vc.getReference(), alt); - builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); - VariantContextUtils.calculateChromosomeCounts(builder, true); - final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); - biallelics.add(trimmed); - } - - return biallelics; - } - } - - public static Genotype removePLsAndAD(final Genotype g) { - return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
- * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name. - * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
- * - * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final int originalNumOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) - return null; - - if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) - throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); - - if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); - - final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); - // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList<>(); - - for 
(final VariantContext vc : preFilteredVCs) { - if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(vc); - } - - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled - return null; - - // establish the baseline info from the first VC - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - final Allele refAllele = determineReferenceAllele(VCs); - - final Set alleles = new LinkedHashSet<>(); - final Set filters = new HashSet<>(); - final Map attributes = new LinkedHashMap<>(); - final Set inconsistentAttributes = new HashSet<>(); - final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - - VariantContext longestVC = first; - int depth = 0; - int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap<>(); - final Map> annotationMap = new LinkedHashMap<>(); - double log10PError = CommonInfo.NO_LOG10_PERROR; - boolean anyVCHadFiltersApplied = false; - VariantContext vcWithMaxAC = null; - GenotypesContext genotypes = GenotypesContext.create(); - - // counting the number of filtered and variant VCs - int nFiltered = 0; - - boolean remapped = false; - - // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { - if ( longestVC.getStart() != vc.getStart() ) - throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); - - if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) - longestVC = vc; // get the longest location - - nFiltered += vc.isFiltered() ? 
1 : 0; - if ( vc.isVariant() ) variantSources.add(vc.getSource()); - - AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); - remapped = remapped || alleleMapping.needsRemapping(); - - alleles.addAll(alleleMapping.values()); - - mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - - // We always take the QUAL of the first VC with a non-MISSING qual for the combined value - if ( log10PError == CommonInfo.NO_LOG10_PERROR ) - log10PError = vc.getLog10PError(); - - filters.addAll(vc.getFilters()); - anyVCHadFiltersApplied |= vc.filtersWereApplied(); - - // - // add attributes - // - // special case DP (add it up) and ID (just preserve it) - // - if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a "," separator - if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (final String alleleCount : alleleCountArray) { - final int ac = Integer.valueOf(alleleCount.trim()); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } else { - final int ac = Integer.valueOf(rawAlleleCounts); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } - - for (final Map.Entry p : vc.getAttributes().entrySet()) { - final String key = p.getKey(); - final Object value = p.getValue(); - // only output annotations that have the same value in every input VC - // if we don't like the key already, don't go anywhere - if ( ! 
inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - attributes.put(key, value); - } - } - } - } - - // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD - for ( final VariantContext vc : VCs ) { - if (vc.getAlleles().size() == 1) - continue; - if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { - if ( ! genotypes.isEmpty() ) { - logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", - vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); - } - genotypes = stripPLsAndAD(genotypes); - // this will remove stale AC,AF attributed from vc - VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); - break; - } - } - - // take the VC with the maxAC and pull the attributes into a modifiable map - if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } - - // if at least one record was unfiltered and we want a union, clear all of the filters - if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) - filters.clear(); - - - if ( annotateOrigin ) { // we care about where the call came from - String setValue; - if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered - setValue = MERGE_INTERSECTION; - else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference - setValue = MERGE_REF_IN_ALL; - else { - final LinkedHashSet s = new LinkedHashSet<>(); - for ( final VariantContext vc : VCs ) - if ( vc.isVariant() ) - s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); - setValue = Utils.join("-", s); - } - - if ( setKey != null ) { - attributes.put(setKey, setValue); - if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.put(setKey, setValue); - } - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); - builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); - builder.alleles(alleles); - builder.genotypes(genotypes); - builder.log10PError(log10PError); - if ( anyVCHadFiltersApplied ) { - builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); - } - builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - - // Trim the padded bases of all alleles if necessary - final VariantContext merged = builder.make(); - if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); - return merged; - } - - private static Comparable combineAnnotationValues( final List array ) { - return MathUtils.median(array); // right now we take the median but other options could be explored - } - - /** - * Merges VariantContexts from gVCFs into a single hybrid. - * Assumes that none of the input records are filtered. - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. 
don't start at the location in loc); if null, we'll return null in this case - * @return new VariantContext representing the merge of all VCs or null if it not relevant - */ - public static VariantContext referenceConfidenceMerge(final List VCs, final GenomeLoc loc, final Byte refBase) { - - if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty"); - - // establish the baseline info (sometimes from the first VC) - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - - // ref allele - final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase); - if ( refAllele == null ) - return null; - - // alt alleles - final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc); - final List alleles = getAllelesListFromMapper(refAllele, alleleMapper); - - final Map attributes = new LinkedHashMap<>(); - final Set inconsistentAttributes = new HashSet<>(); - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - - VariantContext longestVC = first; - int depth = 0; - final Map> annotationMap = new LinkedHashMap<>(); - GenotypesContext genotypes = GenotypesContext.create(); - - // cycle through and add info from the other VCs - for ( final VariantContext vc : VCs ) { - - // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) - final boolean isSpanningEvent = loc.getStart() != vc.getStart(); - final List remappedAlleles = isSpanningEvent ? 
replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles()); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles); - - // special case DP (add it up) for all events - if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - - if ( isSpanningEvent ) - continue; - - // keep track of the longest location that starts here - if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) - longestVC = vc; - - // special case ID (just preserve it) - if ( vc.hasID() ) rsIDs.add(vc.getID()); - - // add attributes - addReferenceConfidenceAttributes(vc.getAttributes(), attributes, inconsistentAttributes, annotationMap); - } - - // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles) - .loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()) - .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later - - // remove stale AC and AF based attributes - removeStaleAttributesAfterMerge(builder); - - return builder.make(); - } - - /** - * Determines the ref allele given the provided reference base at this position - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning - * @return new Allele or null if no reference allele/base is available - */ - private static Allele determineReferenceAlleleGiveReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { - final Allele refAllele = determineReferenceAllele(VCs, loc); - if ( refAllele == null ) - return ( refBase == null ? 
null : Allele.create(refBase, true) ); - return refAllele; - } - - /** - * Creates an alleles list given a reference allele and a mapper - * - * @param refAllele the reference allele - * @param alleleMapper the allele mapper - * @return a non-null, non-empty list of Alleles - */ - private static List getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) { - final List alleles = new ArrayList<>(); - alleles.add(refAllele); - alleles.addAll(alleleMapper.getUniqueMappedAlleles()); - return alleles; - } - - /** - * Remove the stale attributes from the merged VariantContext (builder) - * - * @param builder the VC builder - */ - private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) { - builder.rmAttributes(Arrays.asList(VCFConstants.ALLELE_COUNT_KEY, - VCFConstants.ALLELE_FREQUENCY_KEY, - VCFConstants.ALLELE_NUMBER_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); - } - - /** - * Adds attributes to the global map from the new context in a sophisticated manner - * - * @param myAttributes attributes to add from - * @param globalAttributes global set of attributes to add to - * @param inconsistentAttributes set of attributes that are inconsistent among samples - * @param annotationMap map of annotations for combining later - */ - private static void addReferenceConfidenceAttributes(final Map myAttributes, - final Map globalAttributes, - final Set inconsistentAttributes, - final Map> annotationMap) { - for ( final Map.Entry p : myAttributes.entrySet() ) { - final String key = p.getKey(); - final Object value = p.getValue(); - boolean badAnnotation = false; - - // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with 
the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (final NumberFormatException e) { - badAnnotation = true; - } - - // only output annotations that have the same value in every input VC - if ( badAnnotation && ! inconsistentAttributes.contains(key) ) { - checkForConsistency(key, value, globalAttributes, inconsistentAttributes); - } - } - } - - /** - * Check attributes for consistency to others in the merge - * - * @param key the attribute key - * @param value the attribute value - * @param globalAttributes the global list of attributes being merged - * @param inconsistentAttributes the list of inconsistent attributes in the merge - */ - private static void checkForConsistency(final String key, - final Object value, - final Map globalAttributes, - final Set inconsistentAttributes) { - final boolean alreadyFound = globalAttributes.containsKey(key); - final Object boundValue = globalAttributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - globalAttributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - globalAttributes.put(key, value); - } - } - - private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { - final Iterator it1 = alleleSet1.iterator(); - final Iterator it2 = alleleSet2.iterator(); - - while ( it1.hasNext() && it2.hasNext() ) { - final Allele a1 = it1.next(); - final Allele a2 = it2.next(); - if ( ! a1.equals(a2) ) - return true; - } - - // by this point, at least one of the iterators is empty. All of the elements - // we've compared are equal up until this point. 
But it's possible that the - // sets aren't the same size, which is indicated by the test below. If they - // are of the same size, though, the sets are compatible - return it1.hasNext() || it2.hasNext(); - } - - public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { - final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - - for ( final Genotype g : genotypes ) { - newGs.add(removePLsAndAD(g)); - } - - return newGs; - } - - /** - * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles - * from the original VariantContext are no longer present. - * - * @param selectedVC the selected (new) VariantContext - * @param originalVC the original VariantContext - * @return a new non-null GenotypesContext - */ - public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { - final int numNewAlleles = selectedVC.getAlleles().size(); - final int numOriginalAlleles = originalVC.getAlleles().size(); - - // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong - if ( numNewAlleles > numOriginalAlleles ) - throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); - - final GenotypesContext oldGs = selectedVC.getGenotypes(); - - // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything - if ( numNewAlleles == numOriginalAlleles ) - return oldGs; - - final GenotypesContext newGs = fixPLsFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); - - return fixADFromSubsettedAlleles(newGs, originalVC, selectedVC.getAlleles()); - } - - /** - * Fix the PLs for the GenotypesContext of a VariantContext that has been subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param 
allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixPLsFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); - } - - /** - * Fix the AD for the GenotypesContext of a VariantContext that has been subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); - } - - return newGTs; - } - - /** - * Fix the AD for the given Genotype - * - * @param genotype the original Genotype - * @param alleleIndexesToUse a bitset describing whether or not to keep a given index - * @param nAllelesToUse how many alleles we are keeping - * @return a non-null Genotype - */ - private static 
Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { - // if it ain't broke don't fix it - if ( !genotype.hasAD() ) - return genotype; - - final GenotypeBuilder builder = new GenotypeBuilder(genotype); - - final int[] oldAD = genotype.getAD(); - if ( oldAD.length != alleleIndexesToUse.length ) { - builder.noAD(); - } else { - final int[] newAD = new int[nAllelesToUse]; - int currentIndex = 0; - for ( int i = 0; i < oldAD.length; i++ ) { - if ( alleleIndexesToUse[i] ) - newAD[currentIndex++] = oldAD[i]; - } - builder.AD(newAD); - } - return builder.make(); - } - - static private Allele determineReferenceAllele(final List VCs) { - return determineReferenceAllele(VCs, null); - } - - /** - * Determines the common reference allele - * - * @param VCs the list of VariantContexts - * @param loc if not null, ignore records that do not begin at this start location - * @return possibly null Allele - */ - static private Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { - Allele ref = null; - - for ( final VariantContext vc : VCs ) { - if ( contextMatchesLoc(vc, loc) ) { - final Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); - } - } - - return ref; - } - - static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { - return loc == null || loc.getStart() == vc.getStart(); - } - - /** - * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts. 
- * - * @param VCs the list of VariantContexts - * @param refAllele the reference allele - * @param loc if not null, ignore records that do not begin at this start location - * @return non-null AlleleMapper - */ - static private AlleleMapper determineAlternateAlleleMapping(final List VCs, final Allele refAllele, final GenomeLoc loc) { - final Map map = new HashMap<>(); - - for ( final VariantContext vc : VCs ) { - if ( contextMatchesLoc(vc, loc) ) - addAllAlternateAllelesToMap(vc, refAllele, map); - } - - return new AlleleMapper(map); - } - - /** - * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper) - * - * @param vc the VariantContext - * @param refAllele the reference allele - * @param map the allele mapping to populate - */ - static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map map) { - // if the ref allele matches, then just add the alts as is - if ( refAllele.equals(vc.getReference()) ) { - for ( final Allele altAllele : vc.getAlternateAlleles() ) { - // ignore symbolic alleles - if ( ! altAllele.isSymbolic() ) - map.put(altAllele, altAllele); - } - } - else { - map.putAll(createAlleleMapping(refAllele, vc, map.values())); - } - } - - static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { - if ( refAllele.equals(vc.getReference()) ) - return new AlleleMapper(vc); - else { - final Map map = createAlleleMapping(refAllele, vc, allAlleles); - map.put(vc.getReference(), refAllele); - return new AlleleMapper(map); - } - } - - /** - * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele - * - * The refAllele is the longest reference allele seen at this start site. 
- * So imagine it is: - * refAllele: ACGTGA - * myRef: ACGT - * myAlt: A - * - * We need to remap all of the alleles in vc to include the extra GA so that - * myRef => refAllele and myAlt => AGA - * - * @param refAllele the new (extended) reference allele - * @param oneVC the Variant Context to extend - * @param currentAlleles the list of alleles already created - * @return a non-null mapping of original alleles to new (extended) ones - */ - private static Map createAlleleMapping(final Allele refAllele, - final VariantContext oneVC, - final Collection currentAlleles) { - final Allele myRef = oneVC.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - - final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - - final Map map = new HashMap<>(); - for ( final Allele a : oneVC.getAlternateAlleles() ) { - if ( isUsableAlternateAllele(a) ) { - Allele extended = Allele.extend(a, extraBases); - for ( final Allele b : currentAlleles ) - if ( extended.equals(b) ) - extended = b; - map.put(a, extended); - } - } - - return map; - } - - static private boolean isUsableAlternateAllele(final Allele allele) { - return ! 
(allele.isReference() || allele.isSymbolic() ); - } - - public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { - if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList<>(unsortedVCs); - else { - ArrayList sorted = new ArrayList<>(unsortedVCs); - Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); - return sorted; - } - } - - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { - //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( final Genotype g : oneVC.getGenotypes() ) { - final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); - if ( ! mergedGenotypes.containsSample(name) ) { - // only add if the name is new - Genotype newG = g; - - if ( uniquifySamples || alleleMapping.needsRemapping() ) { - final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); - newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); - } - - mergedGenotypes.add(newG); - } - } - } - - /** - * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele - * - * @param alleles list of alleles to replace - * @return non-null list of alleles - */ - private static List replaceWithNoCalls(final List alleles) { - if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); - - final List result = new ArrayList<>(alleles.size()); - for ( final Allele allele : alleles ) - result.add(allele.equals(NON_REF_SYMBOLIC_ALLELE) ? 
allele : Allele.NO_CALL); - return result; - } - - /** - * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. - * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. - * - * @param mergedGenotypes the genotypes context to add to - * @param VC the Variant Context for the sample - * @param remappedAlleles the list of remapped alleles for the sample - * @param targetAlleles the list of target alleles - */ - private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, - final VariantContext VC, - final List remappedAlleles, - final List targetAlleles) { - for ( final Genotype g : VC.getGenotypes() ) { - - // only add if the name is new - final String name = g.getSampleName(); - if ( !mergedGenotypes.containsSample(name) ) { - // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways - final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles); - final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); - // note that we set the alleles to null here (as we expect it to be re-genotyped) - final Genotype newG = new GenotypeBuilder(g).name(name).alleles(null).PL(PLs).noAD().noGQ().make(); - mergedGenotypes.add(newG); - } - } - } - - /** - * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. - * If the myAlleles set does not contain "" as an allele, it throws an exception. 
- * - * @param remappedAlleles the list of alleles to evaluate - * @param targetAlleles the target list of alleles - * @return non-null array of ints representing indexes - */ - protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles) { - - if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); - if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); - - if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) - throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records"); - final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); - - final int[] indexMapping = new int[targetAlleles.size()]; - - // the reference alleles always match up (even if they don't appear to) - indexMapping[0] = 0; - - // create the index mapping, using the allele whenever such a mapping doesn't exist - for ( int i = 1; i < targetAlleles.size(); i++ ) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); - indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; - } - - return indexMapping; - } - - /** - * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs. - * Throws an exception if the Genotype does not contain PLs. 
- * - * @param genotype the genotype from which to grab PLs - * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles - * @return non-null array of new PLs - */ - protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) { - if ( !genotype.hasPL() ) - throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs"); - - final int[] originalPLs = genotype.getPL(); - - // assume diploid - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2); - final int[] newPLs = new int[numLikelihoods]; - - for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) { - for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) { - final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]); - if ( originalPLindex >= originalPLs.length ) - throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length); - - final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j); - newPLs[newPLindex] = originalPLs[originalPLindex]; - } - } - - return newPLs; - } - - /** - * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() - * - * @param originalIndex1 the index of the first allele - * @param originalIndex2 the index of the second allele - * @return the PL index - */ - protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { - // we need to make sure they are ordered correctly - return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); - } - - public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { - return uniquify ? sampleName + "." 
+ trackName : sampleName; - } - - /** - * Trim the alleles in inputVC from the reverse direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, false, true); - } - - /** - * Trim the alleles in inputVC from the forward direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, true, false); - } - - /** - * Trim the alleles in inputVC forward and reverse, as requested - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the forward direction? - * @param trimReverse should we trim up the alleles from the reverse direction? - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Ensures("result != null") - public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { - if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); - - if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) - return inputVC; - - // see whether we need to trim common reference base from all alleles - final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; - final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); - final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; - final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); - return vc; - } - - /** - * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and - * the last revTrim bases from the end - * - * @param inputVC a non-null input VC - * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles - * @param revTrim the last revTrim bases of each allele will be clipped off as well - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Requires({"inputVC != null"}) - @Ensures("result != null") - protected static VariantContext trimAlleles(final VariantContext inputVC, - final int fwdTrimEnd, - final int revTrim) { - if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified - return inputVC; - - final List alleles = new LinkedList<>(); - final Map originalToTrimmedAlleleMap = new HashMap<>(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); - final Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); - final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); - - final int start = inputVC.getStart() + (fwdTrimEnd + 1); - final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - builder.start(start); - builder.stop(start + alleles.get(0).length() - 1); - builder.alleles(alleles); - builder.genotypes(genotypes); - return 
builder.make(); - } - - @Requires("originalGenotypes != null && alleleMapper != null") - protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { - final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); - - for ( final Genotype genotype : originalGenotypes ) { - final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); - updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); - } - - return updatedGenotypes; - } - - public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - 1; - - if ( a.length() - clipping <= 0 || a.length() == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - return -1; - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - - /** - * Clip out any unnecessary bases off the front of the alleles - * - * The VCF spec represents alleles as block substitutions, replacing AC with A for a - * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that - * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. - * This routine finds an offset among all alleles that can be safely trimmed - * off the left of each allele and still represent the same block substitution. 
- * - * A/C => A/C - * AC/A => AC/A - * ACC/AC => CC/C - * AGT/CAT => AGT/CAT - * /C => /C - * - * @param unclippedAlleles a non-null list of alleles that we want to clip - * @return the offset into the alleles where we can safely clip, inclusive, or - * -1 if no clipping is tolerated. So, if the result is 0, then we can remove - * the first base of every allele. If the result is 1, we can remove the - * second base. - */ - public static int computeForwardClipping(final List unclippedAlleles) { - // cannot clip unless there's at least 1 alt allele - if ( unclippedAlleles.size() <= 1 ) - return -1; - - // we cannot forward clip any set of alleles containing a symbolic allele - int minAlleleLength = Integer.MAX_VALUE; - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - return -1; - minAlleleLength = Math.min(minAlleleLength, a.length()); - } - - final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); - int indexOflastSharedBase = -1; - - // the -1 to the stop is that we can never clip off the right most base - for ( int i = 0; i < minAlleleLength - 1; i++) { - final byte base = firstAlleleBases[i]; - - for ( final Allele allele : unclippedAlleles ) { - if ( allele.getBases()[i] != base ) - return indexOflastSharedBase; - } - - indexOflastSharedBase = i; - } - - return indexOflastSharedBase; - } - - public static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getCalledChrCount() == 0 ) - return 0.0; - return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); - } - - public static boolean requiresPaddingBase(final List alleles) { - - // see whether one of the alleles would be null if trimmed through - - for ( final String allele : alleles ) { - if ( allele.isEmpty() ) - return true; - } - - int clipping = 0; - Character currentBase = null; - - while ( true ) { - for ( final String allele : alleles ) { - if ( allele.length() - clipping == 0 ) - return true; - - char myBase = 
allele.charAt(clipping); - if ( currentBase == null ) - currentBase = myBase; - else if ( currentBase != myBase ) - return false; - } - - clipping++; - currentBase = null; - } - } - - private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap<>(keysToPreserve.size()); - for ( final String key : keysToPreserve ) { - if ( igc.hasAttribute(key) ) - attributes.put(key, igc.getAttribute(key)); - } - return attributes; - } - - /** - * @deprecated use variant context builder version instead - * @param vc the variant context - * @param keysToPreserve the keys to preserve - * @return a pruned version of the original variant context - */ - @Deprecated - public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { - return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); - } - - public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { - final VariantContext vc = builder.make(); - if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); - - // VC info - final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); - - // Genotypes - final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - final GenotypeBuilder gb = new GenotypeBuilder(g); - // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ - gb.noAD().noDP().noPL().noAttributes(); - genotypes.add(gb.make()); - } - - return builder.genotypes(genotypes).attributes(attributes); - } - - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { - // if all alleles of vc1 are a contained in alleles of vc2, return true - if (!vc1.getReference().equals(vc2.getReference())) - return false; - - for (final Allele a :vc1.getAlternateAlleles()) { - if (!vc2.getAlternateAlleles().contains(a)) 
- return false; - } - - return true; - } - - public static Map> separateVariantContextsByType( final Collection VCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } - - final HashMap> mappedVCs = new HashMap<>(); - for ( final VariantContext vc : VCs ) { - VariantContext.Type vcType = vc.getType(); - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } - vcType = VariantContext.Type.NO_VARIATION; - } - - // look at previous variant contexts of different type. If: - // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list - // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) - // c) neither: do nothing, just add vc to its own list - boolean addtoOwnList = true; - for (final VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vcType)) - continue; - - if (!mappedVCs.containsKey(type)) - continue; - - List vcList = mappedVCs.get(type); - for (int k=0; k < vcList.size(); k++) { - VariantContext otherVC = vcList.get(k); - if (allelesAreSubset(otherVC,vc)) { - // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list - vcList.remove(k); - // avoid having empty lists - if (vcList.size() == 0) - mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(otherVC); - break; - } - else if (allelesAreSubset(vc,otherVC)) { - // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own - mappedVCs.get(type).add(vc); - addtoOwnList = false; - break; - } - } - } - if (addtoOwnList) { - if ( 
!mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(vc); - } - } - - return mappedVCs; - } - - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { - if ( allowedAttributes == null ) - return vc; - - final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - final Map attrs = new HashMap<>(); - for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { - if ( allowedAttributes.contains(attr.getKey()) ) - attrs.put(attr.getKey(), attr.getValue()); - } - newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); - } - - return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); - } - - - protected static class AlleleMapper { - private VariantContext vc = null; - private Map map = null; - public AlleleMapper(VariantContext vc) { this.vc = vc; } - public AlleleMapper(Map map) { this.map = map; } - public boolean needsRemapping() { return this.map != null; } - public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } - - public List remap(List as) { - List newAs = new ArrayList<>(); - for ( final Allele a : as ) { - //System.out.printf(" Remapping %s => %s%n", a, remap(a)); - newAs.add(remap(a)); - } - return newAs; - } - - /** - * @return the list of unique values - */ - public List getUniqueMappedAlleles() { - if ( map == null ) - return Collections.emptyList(); - return new ArrayList<>(new HashSet<>(map.values())); - } - } - - private static class CompareByPriority implements Comparator, Serializable { - List priorityListOfVCs; - public CompareByPriority(List priorityListOfVCs) { - this.priorityListOfVCs = priorityListOfVCs; - } - - private int getIndex(VariantContext vc) { - int i = priorityListOfVCs.indexOf(vc.getSource()); - if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); - return i; - } - - public int compare(VariantContext vc1, VariantContext vc2) { - return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); - } - } - - /** - * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles - * - * @param name the name of the VC - * @param contig the contig for the VC - * @param start the start of the VC - * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the - * alt. 
Will compute the stop of the VC from the length of the reference allele - * @return a non-null VariantContext - */ - public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { - if ( alleleStrings == null || alleleStrings.isEmpty() ) - throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - - final List alleles = new LinkedList<>(); - final int length = alleleStrings.get(0).length(); - - boolean first = true; - for ( final String alleleString : alleleStrings ) { - alleles.add(Allele.create(alleleString, first)); - first = false; - } - return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); - } - - /** - * Splits the alleles for the provided variant context into its primitive parts. - * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. - * Currently works only for MNPs. - * - * @param vc the non-null VC to split - * @return a non-empty list of VCs split into primitive parts or the original VC otherwise - */ - public static List splitIntoPrimitiveAlleles(final VariantContext vc) { - if ( vc == null ) - throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); - - if ( !vc.isBiallelic() ) - throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); - - // currently only works for MNPs - if ( !vc.isMNP() ) - return Arrays.asList(vc); - - final byte[] ref = vc.getReference().getBases(); - final byte[] alt = vc.getAlternateAllele(0).getBases(); - - if ( ref.length != alt.length ) - throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - - final List result = new ArrayList<>(ref.length); - - for ( int i = 0; i < ref.length; i++ ) { - - // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) - 
if ( ref[i] != alt[i] ) { - - // create the ref and alt SNP alleles - final Allele newRefAllele = Allele.create(ref[i], true); - final Allele newAltAllele = Allele.create(alt[i], false); - - // create a new VariantContext with the new SNP alleles - final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); - - // create new genotypes with updated alleles - final Map alleleMap = new HashMap<>(); - alleleMap.put(vc.getReference(), newRefAllele); - alleleMap.put(vc.getAlternateAllele(0), newAltAllele); - final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); - - result.add(newVC.genotypes(newGenotypes).make()); - } - } - - if ( result.isEmpty() ) - result.add(vc); - - return result; - } - - /** - * Are vc1 and 2 equal including their position and alleles? - * @param vc1 non-null VariantContext - * @param vc2 non-null VariantContext - * @return true if vc1 and vc2 are equal, false otherwise - */ - public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { - if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); - if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); - - if ( vc1.getStart() != vc2.getStart() ) return false; - if ( vc1.getEnd() != vc2.getEnd() ) return false; - if ( ! vc1.getChr().equals(vc2.getChr())) return false; - if ( ! 
vc1.getAlleles().equals(vc2.getAlleles()) ) return false; - return true; - } -} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java deleted file mode 100644 index c1e11e2ce..000000000 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ /dev/null @@ -1,513 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting; - -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.apache.log4j.spi.LoggingEvent; -import org.broad.tribble.readers.LineIterator; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.crypt.CryptUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.io.IOUtils; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.testng.Assert; -import org.testng.Reporter; -import org.testng.SkipException; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * - * User: aaron - * Date: Apr 14, 2009 - * Time: 10:24:30 AM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

- * Class BaseTest - *

- * This is the base test class for all of our test cases. All test cases should extend from this - * class; it sets up the logger, and resolves the location of directories that we rely on. - */ -@SuppressWarnings("unchecked") -public abstract class BaseTest { - /** our log, which we want to capture anything from org.broadinstitute.sting */ - public static final Logger logger = CommandLineUtils.getStingLogger(); - - public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; - //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; - public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; - public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; - public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; - public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; - public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; - public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; - public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; - - public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; - - public static final String dbsnpDataLocation = GATKDataLocation; - public static final String b36dbSNP129 = dbsnpDataLocation + 
"dbsnp_129_b36.vcf"; - public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; - public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; - public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; - - public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; - public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; - public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf"; - - public static final String intervalsLocation = GATKDataLocation; - public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; - public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - - public static final boolean REQUIRE_NETWORK_CONNECTION = false; - private static final String networkTempDirRoot = "/broad/hptmp/"; - private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); - private static final String networkTempDir; - private static final File networkTempDirFile; - - private static final String privateTestDirRelative = "private/testdata/"; - public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/"; - protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); - - private static final String publicTestDirRelative = "public/testdata/"; - public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/"; - protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); - - public static final String keysDataLocation = validationDataLocation + "keys/"; - public static final String gatkKeyFile = 
CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; - - public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; - - public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; - public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; - - public static final boolean pipelineTestRunModeIsSet = System.getProperty("pipeline.run").equals("run"); - - /** before the class starts up */ - static { - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - - // setup our log layout - PatternLayout layout = new PatternLayout(); - layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Set the Root logger to only output warnings. - logger.setLevel(Level.WARN); - - if (networkTempDirRootExists) { - networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); - networkTempDirFile.deleteOnExit(); - networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; - } else { - networkTempDir = null; - networkTempDirFile = null; - } - - - if ( REQUIRE_NETWORK_CONNECTION ) { - // find our file sources - if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { - logger.fatal("We can't locate the reference directories. Aborting!"); - throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); - } - } - } - - /** - * Simple generic utility class to creating TestNG data providers: - * - * 1: inherit this class, as in - * - * private class SummarizeDifferenceTest extends TestDataProvider { - * public SummarizeDifferenceTest() { - * super(SummarizeDifferenceTest.class); - * } - * ... - * } - * - * Provide a reference to your class to the TestDataProvider constructor. 
- * - * 2: Create instances of your subclass. Return from it the call to getTests, providing - * the class type of your test - * - * @DataProvider(name = "summaries" - * public Object[][] createSummaries() { - * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - * } - * - * This class magically tracks created objects of this - */ - public static class TestDataProvider { - private static final Map> tests = new HashMap>(); - protected String name; - - /** - * Create a new TestDataProvider instance bound to the class variable C - * @param c - */ - public TestDataProvider(Class c, String name) { - if ( ! tests.containsKey(c) ) - tests.put(c, new ArrayList()); - tests.get(c).add(this); - this.name = name; - } - - public TestDataProvider(Class c) { - this(c, ""); - } - - public void setName(final String name) { - this.name = name; - } - - /** - * Return all of the data providers in the form expected by TestNG of type class C - * @param c - * @return - */ - public static Object[][] getTests(Class c) { - List params2 = new ArrayList(); - for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Override - public String toString() { - return "TestDataProvider("+name+")"; - } - } - - /** - * test if the file exists - * - * @param file name as a string - * @return true if it exists - */ - public static boolean fileExist(String file) { - File temp = new File(file); - return temp.exists(); - } - - /** - * this appender looks for a specific message in the log4j stream. - * It can be used to verify that a specific message was generated to the logging system. 
- */ - public static class ValidationAppender extends AppenderSkeleton { - - private boolean foundString = false; - private String targetString = ""; - - public ValidationAppender(String target) { - targetString = target; - } - - @Override - protected void append(LoggingEvent loggingEvent) { - if (loggingEvent.getMessage().equals(targetString)) - foundString = true; - } - - public void close() { - // do nothing - } - - public boolean requiresLayout() { - return false; - } - - public boolean foundString() { - return foundString; - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. - */ - public static File createTempFile(String name, String extension) { - try { - File file = File.createTempFile(name, extension); - file.deleteOnExit(); - return file; - } catch (IOException ex) { - throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Name of the file. - * @return A file in the network temporary directory with name, which will be deleted after the program exits. - * @throws SkipException when the network is not available. 
- */ - public static File tryCreateNetworkTempFile(String name) { - if (!networkTempDirRootExists) - throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); - File file = new File(networkTempDirFile, name); - file.deleteOnExit(); - return file; - } - - /** - * Log this message so that it shows up inline during output as well as in html reports - * - * @param message - */ - public static void log(final String message) { - Reporter.log(message, true); - } - - private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected); - } - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); - } - - public static final void assertEqualsDoubleSmart(final double actual, final double expected) { - assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); - } - - public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { - final Set actualSet = new HashSet(actual); - final Set expectedSet = new HashSet(expected); - Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { - assertEqualsDoubleSmart(actual, expected, tolerance, null); - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { - if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); - else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); - else { - final double delta = Math.abs(actual - expected); - final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual - + " not within tolerance " + tolerance - + (message == null ? "" : "message: " + message)); - } - } - - public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual, "VariantContext expected not null"); - Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); - Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); - Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); - Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); - - assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); - assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); - assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); - if ( expected.hasGenotypes() ) { - assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); - Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); - final Set samples = expected.getSampleNames(); - for ( final String sample : samples ) { - assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); - } - } - } - - public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { - final Iterator actualIT = actual.iterator(); - final Iterator expectedIT = expected.iterator(); - - while ( expectedIT.hasNext() ) { - final VariantContext expectedVC = expectedIT.next(); - if ( expectedVC == null ) - continue; - - VariantContext actualVC; - do { - Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); - actualVC = actualIT.next(); - } while ( actualIT.hasNext() && actualVC == null ); - - if ( actualVC == null ) - Assert.fail("Too few records in actual"); - - assertVariantContextsAreEqual(actualVC, expectedVC); - } - Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); - } - - - public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); - Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); - - // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); - - // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); - Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); - Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); - Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); - Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - - 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); - Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); - - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); - assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); - } - - public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); - - // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
- //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); - final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); - final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); - for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); - } - } - - public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); - final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); - assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); - assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); - } - - private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { - if ( expected instanceof Double ) { - // must be very tolerant because doubles are being rounded to 2 sig figs - assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); - } else - Assert.assertEquals(actual, expected, "Attribute " + key); - } - - private static void assertAttributesEquals(final Map actual, Map expected) { - final Set expectedKeys = new HashSet(expected.keySet()); - - for ( final Map.Entry act : actual.entrySet() ) { - final Object actualValue = act.getValue(); - if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { - final Object expectedValue = expected.get(act.getKey()); - if ( expectedValue instanceof List ) { - final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); - final List actualList = (List)actualValue; - Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); - for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); 
- } else - assertAttributeEquals(act.getKey(), actualValue, expectedValue); - } else { - // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); - } - expectedKeys.remove(act.getKey()); - } - - // now expectedKeys contains only the keys found in expected but not in actual, - // and they must all be null - for ( final String missingExpected : expectedKeys ) { - final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); - } - } - - private static final boolean isMissing(final Object value) { - if ( value == null ) return true; - else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; - else if ( value instanceof List ) { - // handles the case where all elements are null or the list is empty - for ( final Object elt : (List)value) - if ( elt != null ) - return false; - return true; - } else - return false; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java deleted file mode 100644 index 84bc6e080..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ /dev/null @@ -1,248 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial 
portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.ArgumentException; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.*; - -/** - * Tests selected functionality in the GenomeAnalysisEngine class - */ -public class GenomeAnalysisEngineUnitTest extends BaseTest { - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new 
File("public/testdata/exampleBAM.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - final File relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); - final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); - samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test - public void testEmptyIntervalSetHandling() throws Exception { - GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); - - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - testEngine.setWalker(new PrintReads()); - testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); - - testEngine.validateSuppliedIntervals(); - } - - @Test - public void testLoadWellFormedSampleRenameMapFile() throws IOException { - final File mapFile = 
createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", - "/foo/bar/second.bam newSample2", - "/foo/bar2/third.bam newSample3")); - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - - Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); - - final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); - while ( expectedResultsIterator.hasNext() ) { - final String expectedKey = expectedResultsIterator.next(); - final String expectedValue = expectedResultsIterator.next(); - - Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); - Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); - } - } - - @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") - public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { - final List tests = new ArrayList(); - - tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", - new File("/foo/bar/nonexistent")}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", - createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", - createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", - "/path/to/dupe.bam 
newSample2"))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) - public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { - logger.info("Executing test " + testName); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - } - - private File createTestSampleRenameMapFile( final List contents ) throws IOException { - final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); - final PrintWriter writer = new PrintWriter(mapFile); - - for ( final String line : contents ) { - writer.println(line); - } - writer.close(); - - return mapFile; - } - - /////////////////////////////////////////////////// - // Test the ReadTransformer ordering enforcement // - /////////////////////////////////////////////////// - - public static class TestReadTransformer extends ReadTransformer { - - private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; - private boolean enabled; - - protected TestReadTransformer(final OrderingConstraint orderingConstraint) { - this.orderingConstraint = orderingConstraint; - enabled = true; - } - - // need this because PackageUtils will pick up this class as a possible ReadTransformer - protected TestReadTransformer() { - enabled = false; - } - - @Override - public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } - - @Override - public boolean enabled() { return enabled; } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } - - } - - @DataProvider(name = "ReadTransformerData") - public Object[][] makeReadTransformerData() { - List tests = new ArrayList(); - - for ( final 
ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { - tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReadTransformerData") - public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { - - final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - final List readTransformers = new ArrayList(3); - readTransformers.add(new TestReadTransformer(oc1)); - readTransformers.add(new TestReadTransformer(oc2)); - readTransformers.add(new TestReadTransformer(oc3)); - - final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || - numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; - - try { - testEngine.setReadTransformers(readTransformers); - - Assert.assertFalse(shouldThrowException); - Assert.assertEquals(testEngine.getReadTransformers().size(), 3); - - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - } catch 
(UserException.IncompatibleReadFiltersException e) { - Assert.assertTrue(shouldThrowException); - } - } - - private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { - int count = 0; - for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { - if ( constraint == target ) - count++; - } - return count; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java deleted file mode 100644 index 4d3741228..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.WalkerTest; - -import java.util.Collections; - -/** - * Run validating pileup across a set of core data as proof of the integrity of the GATK core. - * - * @author mhanna - * @version 0.1 - */ -public class CheckPileupIntegrationTest extends WalkerTest { - @Test(enabled = true) - public void testEcoliThreaded() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CheckPileup" + - " -I " + validationDataLocation + "MV1994.selected.bam" + - " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + - " -S SILENT -nt 8",0, Collections.emptyList()); - executeTest("testEcoliThreaded",spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java deleted file mode 100644 index 9d4c562c7..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.collections.Pair; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; -import org.testng.Assert; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: 3/5/11 - * Time: 2:06 PM - * To change this template use File | Settings | File Templates. - */ -public class MWUnitTest extends BaseTest { - @BeforeClass - public void init() { } - - @Test - private void testMWU() { - logger.warn("Testing MWU"); - MannWhitneyU mwu = new MannWhitneyU(); - mwu.add(0, MannWhitneyU.USet.SET1); - mwu.add(1,MannWhitneyU.USet.SET2); - mwu.add(2,MannWhitneyU.USet.SET2); - mwu.add(3,MannWhitneyU.USet.SET2); - mwu.add(4,MannWhitneyU.USet.SET2); - mwu.add(5,MannWhitneyU.USet.SET2); - mwu.add(6,MannWhitneyU.USet.SET1); - mwu.add(7,MannWhitneyU.USet.SET1); - mwu.add(8,MannWhitneyU.USet.SET1); - mwu.add(9,MannWhitneyU.USet.SET1); - mwu.add(10,MannWhitneyU.USet.SET1); - mwu.add(11,MannWhitneyU.USet.SET2); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); - - MannWhitneyU mwu2 = new MannWhitneyU(); - MannWhitneyU mwuNoDither = new MannWhitneyU(false); - for ( int dp : new int[]{2,4,5,6,8} ) { - mwu2.add(dp,MannWhitneyU.USet.SET1); - mwuNoDither.add(dp,MannWhitneyU.USet.SET1); - } - - for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { - mwu2.add(dp,MannWhitneyU.USet.SET2); - mwuNoDither.add(dp,MannWhitneyU.USet.SET2); - } - - MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; - MannWhitneyU.ExactMode 
cm = MannWhitneyU.ExactMode.CUMULATIVE; - - // tests using the hypothesis that set 2 dominates set 1 (U value = 10) - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); - - Pair sizes = mwu2.getSetSizes(); - - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14); - logger.warn("Testing two-sided"); - Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); - - // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009328147,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - 
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14); - - logger.warn("Set 1"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); - logger.warn("Set 2"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); - - MannWhitneyU mwu3 = new MannWhitneyU(); - for ( int dp : new int[]{0,2,4} ) { - mwu3.add(dp,MannWhitneyU.USet.SET1); - } - for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { - mwu3.add(dp,MannWhitneyU.USet.SET2); - } - long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); - //logger.warn(String.format("U is: %d",u)); - Pair nums = mwu3.getSetSizes(); - //logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u))); - //logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, 
u))); - //logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first))); - //logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first))); - Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); - - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java deleted file mode 100644 index de049fe89..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ /dev/null @@ -1,859 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import cern.jet.random.Normal; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for MathUtils - */ -public class MathUtilsUnitTest extends BaseTest { - @BeforeClass - public void init() { - } - - /** - * Tests that we get unqiue values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. - */ - @Test - public void testGenerateUniqueHashFromThreePositiveIntegers() { - logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); - - final Set observedLongs = new HashSet(); - for (short i = 0; i < Byte.MAX_VALUE; i++) { - for (short j = 0; j < Byte.MAX_VALUE; j++) { - for (short k = 0; k < Byte.MAX_VALUE; k++) { - final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); - //System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); - Assert.assertTrue(observedLongs.add(aLong)); - } - } - } - - for (short i = Byte.MAX_VALUE; i <= Short.MAX_VALUE && i > 0; i += 128) { - for (short j = Byte.MAX_VALUE; j <= Short.MAX_VALUE && j > 0; j += 128) { - for (short k = Byte.MAX_VALUE; k <= Short.MAX_VALUE && k > 0; k += 128) { - final Long aLong = MathUtils.fastGenerateUniqueHashFromThreeIntegers(i, j, k); - // System.out.println(String.format("%s, %s, %s: %s", i, j, k, aLong)); - Assert.assertTrue(observedLongs.add(aLong)); - } - } - } - } - - /** - * Tests that we get the right values from the binomial 
distribution - */ - @Test - public void testBinomialProbability() { - logger.warn("Executing testBinomialProbability"); - - Assert.assertEquals(MathUtils.binomialProbability(3, 2, 0.5), 0.375, 0.0001); - Assert.assertEquals(MathUtils.binomialProbability(100, 10, 0.5), 1.365543e-17, 1e-18); - Assert.assertEquals(MathUtils.binomialProbability(217, 73, 0.02), 4.521904e-67, 1e-68); - Assert.assertEquals(MathUtils.binomialProbability(300, 100, 0.02), 9.27097e-91, 1e-92); - Assert.assertEquals(MathUtils.binomialProbability(300, 150, 0.98), 6.462892e-168, 1e-169); - Assert.assertEquals(MathUtils.binomialProbability(300, 120, 0.98), 3.090054e-221, 1e-222); - Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); - } - - /** - * Tests that we get the right values from the binomial distribution - */ - @Test - public void testCumulativeBinomialProbability() { - logger.warn("Executing testCumulativeBinomialProbability"); - - for (int j = 0; j < 2; j++) { // Test memoizing functionality, as well. 
- final int numTrials = 10; - for ( int i = 0; i < numTrials; i++ ) - Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); - - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); - Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); - } - } - - /** - * Tests that we get the right values from the multinomial distribution - */ - @Test - public void testMultinomialProbability() { - logger.warn("Executing testMultinomialProbability"); - - int[] counts0 = {2, 0, 1}; - double[] probs0 = {0.33, 0.33, 0.34}; - Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); - - int[] counts1 = {10, 20, 30}; - double[] probs1 = {0.25, 0.25, 0.50}; - Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); - - int[] counts2 = {38, 82, 50, 36}; - double[] probs2 = {0.25, 0.25, 0.25, 0.25}; - Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); - - int[] counts3 = {1, 600, 1}; - double[] probs3 = {0.33, 0.33, 0.34}; - Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); - } - - /** - * Tests that the random index selection is working correctly - */ - @Test - public void testRandomIndicesWithReplacement() { - logger.warn("Executing testRandomIndicesWithReplacement"); - - // Check that the size of the list returned is correct - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 0).size() == 0); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1).size() == 1); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 5).size() == 5); - Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1000).size() == 1000); - - // Check that the 
list contains only the k element range that as asked for - no more, no less - List Five = new ArrayList(); - Collections.addAll(Five, 0, 1, 2, 3, 4); - List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); - Assert.assertTrue(BigFive.containsAll(Five)); - Assert.assertTrue(Five.containsAll(BigFive)); - } - - /** - * Tests that we get the right values from the multinomial distribution - */ - @Test - public void testSliceListByIndices() { - logger.warn("Executing testSliceListByIndices"); - - // Check that the list contains only the k element range that as asked for - no more, no less but now - // use the index list to pull elements from another list using sliceListByIndices - List Five = new ArrayList(); - Collections.addAll(Five, 0, 1, 2, 3, 4); - List FiveAlpha = new ArrayList(); - Collections.addAll(FiveAlpha, 'a', 'b', 'c', 'd', 'e'); - List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); - List BigFiveAlpha = MathUtils.sliceListByIndices(BigFive, FiveAlpha); - Assert.assertTrue(BigFiveAlpha.containsAll(FiveAlpha)); - Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); - } - - /** - * Tests that we correctly compute mean and standard deviation from a stream of numbers - */ - @Test - public void testRunningAverage() { - logger.warn("Executing testRunningAverage"); - - int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; - MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - - for (int i = 0; i < numbers.length; i++) - r.add((double) numbers[i]); - - Assert.assertEquals((long) numbers.length, r.observationCount()); - Assert.assertTrue(r.mean() - 3224.625 < 2e-10); - Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); - } - - @Test - public void testLog10Gamma() { - logger.warn("Executing testLog10Gamma"); - - Assert.assertEquals(MathUtils.log10Gamma(4.0), 0.7781513, 1e-6); - Assert.assertEquals(MathUtils.log10Gamma(10), 5.559763, 1e-6); - Assert.assertEquals(MathUtils.log10Gamma(10654), 38280.53, 1e-2); - } - - @Test - public 
void testLog10BinomialCoefficient() { - logger.warn("Executing testLog10BinomialCoefficient"); - // note that we can test the binomial coefficient calculation indirectly via Newton's identity - // (1+z)^m = sum (m choose k)z^k - double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; - int[] exponent = new int[]{5,15,25,50,100}; - for ( double z : z_vals ) { - double logz = Math.log10(z); - for ( int exp : exponent ) { - double expected_log = exp*Math.log10(1+z); - double[] newtonArray_log = new double[1+exp]; - for ( int k = 0 ; k <= exp; k++ ) { - newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; - } - Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); - } - } - - Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); - Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); - Assert.assertEquals(MathUtils.log10BinomialCoefficient(103928, 119), 400.2156, 1e-4); - } - - @Test - public void testFactorial() { - logger.warn("Executing testFactorial"); - Assert.assertEquals((int) MathUtils.factorial(4), 24); - Assert.assertEquals((int) MathUtils.factorial(10), 3628800); - Assert.assertEquals((int) MathUtils.factorial(12), 479001600); - } - - @Test - public void testLog10Factorial() { - logger.warn("Executing testLog10Factorial"); - Assert.assertEquals(MathUtils.log10Factorial(4), 1.380211, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(10), 6.559763, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); - Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 1e-3); - Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); - double log10factorial_small = 0; - double log10factorial_middle = 374.8969; - double log10factorial_large = 45138.26; - int small_start = 1; - int med_start = 200; - int large_start = 12342; - for ( int i = 1; i < 1000; i++ ) { - log10factorial_small += Math.log10(i+small_start); - 
log10factorial_middle += Math.log10(i+med_start); - log10factorial_large += Math.log10(i+large_start); - Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); - Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); - Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); - } - } - - /** - * Private functions used by testArrayShuffle() - */ - private boolean hasUniqueElements(Object[] x) { - for (int i = 0; i < x.length; i++) - for (int j = i + 1; j < x.length; j++) - if (x[i].equals(x[j]) || x[i] == x[j]) - return false; - return true; - } - - private boolean hasAllElements(final Object[] expected, final Object[] actual) { - HashSet set = new HashSet(); - set.addAll(Arrays.asList(expected)); - set.removeAll(Arrays.asList(actual)); - return set.isEmpty(); - } - - @Test - public void testApproximateLog10SumLog10() { - - final double requiredPrecision = 1E-4; - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new 
double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - // magnitude of 
the sum doesn't matter, so we can combinatorially test this via partitions of unity - double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; - int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; - for ( double alpha : mult_partitionFactor ) { - double log_alpha = Math.log10(alpha); - double log_oneMinusAlpha = Math.log10(1-alpha); - for ( int npart : n_partitions ) { - double[] multiplicative = new double[npart]; - double[] equal = new double[npart]; - double remaining_log = 0.0; // realspace = 1 - for ( int i = 0 ; i < npart-1; i++ ) { - equal[i] = -Math.log10(npart); - double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining - multiplicative[i] = piece; - remaining_log = remaining_log + log_oneMinusAlpha; - } - equal[npart-1] = -Math.log10(npart); - multiplicative[npart-1] = remaining_log; - Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); - } - } - } - - @Test - public void testLog10sumLog10() { - final double requiredPrecision = 1E-14; - - final double log3 = 0.477121254719662; - Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3, requiredPrecision); - - final double log2 = 0.301029995663981; - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0, requiredPrecision); - - 
Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0}), 0.0, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-5.15}), -5.15, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {130.0}), 130.0, requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.145}), -0.145, requiredPrecision); - - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, 
-0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 
requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - - // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity - double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; - int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; - for ( double alpha : mult_partitionFactor ) { - double log_alpha = Math.log10(alpha); - double log_oneMinusAlpha = Math.log10(1-alpha); - for ( int npart : n_partitions ) { - double[] multiplicative = new double[npart]; - double[] equal = new double[npart]; - double remaining_log = 0.0; // realspace = 1 - for ( int i = 0 ; i < npart-1; i++ ) { - equal[i] = -Math.log10(npart); - double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining - multiplicative[i] = piece; - remaining_log = remaining_log + log_oneMinusAlpha; - } - equal[npart-1] = -Math.log10(npart); - multiplicative[npart-1] = remaining_log; - Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); - Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); - } - } - } - - @Test - public void testLogDotProduct() { - Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); - Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); - } - - @Test - public void testNormalDistribution() { - final double requiredPrecision = 1E-10; - - final Normal n = new Normal(0.0, 
1.0, null); - for( final double mu : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { - for( final double sigma : new double[]{1.2, 3.0, 5.8977} ) { - for( final double x : new double[]{-5.0, -3.2, -1.5, 0.0, 1.2, 3.0, 5.8977} ) { - n.setState(mu, sigma); - Assert.assertEquals(n.pdf(x), MathUtils.normalDistribution(mu, sigma, x), requiredPrecision); - Assert.assertEquals(Math.log10(n.pdf(x)), MathUtils.normalDistributionLog10(mu, sigma, x), requiredPrecision); - } - } - } - } - - @DataProvider(name = "ArrayMinData") - public Object[][] makeArrayMinData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList(10), 10}); - tests.add(new Object[]{Arrays.asList(-10), -10}); - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) ) { - tests.add(new Object[]{values, 1}); - } - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { - tests.add(new Object[]{values, -3}); - } - - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinList(final List values, final int expected) { - final int actual = MathUtils.arrayMin(values); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinIntArray(final List values, final int expected) { - final int[] asArray = ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])); - final int actual = MathUtils.arrayMin(asArray); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinByteArray(final List values, final int expected) { - final byte[] asArray = new byte[values.size()]; - for ( int i = 0; i < values.size(); i++ ) asArray[i] = (byte)(values.get(i) & 0xFF); - final byte actual = MathUtils.arrayMin(asArray); - 
Assert.assertEquals(actual, (byte)(expected & 0xFF), "Failed with " + values); - } - - @Test(dataProvider = "ArrayMinData") - public void testArrayMinDoubleArray(final List values, final int expected) { - final double[] asArray = new double[values.size()]; - for ( int i = 0; i < values.size(); i++ ) asArray[i] = (double)(values.get(i)); - final double actual = MathUtils.arrayMin(asArray); - Assert.assertEquals(actual, (double)expected, "Failed with " + values); - } - - @DataProvider(name = "MedianData") - public Object[][] makeMedianData() { - final List tests = new ArrayList<>(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList(10), 10}); - tests.add(new Object[]{Arrays.asList(1, 10), 10}); - - for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { - tests.add(new Object[]{values, 1}); - } - - for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { - tests.add(new Object[]{values, 1.1}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MedianData") - public void testMedian(final List values, final Comparable expected) { - final Comparable actual = MathUtils.median(values); - Assert.assertEquals(actual, expected, "Failed with " + values); - } - - - - // man. All this to test dirichlet. - - private double[] unwrap(List stuff) { - double[] unwrapped = new double[stuff.size()]; - int idx = 0; - for ( Double d : stuff ) { - unwrapped[idx++] = d == null ? 0.0 : d; - } - - return unwrapped; - } - - /** - * The PartitionGenerator generates all of the partitions of a number n, e.g. 
- * 5 + 0 - * 4 + 1 - * 3 + 2 - * 3 + 1 + 1 - * 2 + 2 + 1 - * 2 + 1 + 1 + 1 - * 1 + 1 + 1 + 1 + 1 - * - * This is used to help enumerate the state space over which the Dirichlet-Multinomial is defined, - * to ensure that the distribution function is properly implemented - */ - class PartitionGenerator implements Iterator> { - // generate the partitions of an integer, each partition sorted numerically - int n; - List a; - int y; - int k; - int state; - int x; - int l; - - public PartitionGenerator(int n) { - this.n = n; - this.y = n - 1; - this.k = 1; - this.a = new ArrayList(); - for ( int i = 0; i < n; i++ ) { - this.a.add(i); - } - this.state = 0; - } - - public void remove() { /* do nothing */ } - - public boolean hasNext() { return ! ( this.k == 0 && state == 0 ); } - - private String dataStr() { - return String.format("a = [%s] k = %d y = %d state = %d x = %d l = %d", - Utils.join(",",a), k, y, state, x, l); - } - - public List next() { - if ( this.state == 0 ) { - this.x = a.get(k-1)+1; - k -= 1; - this.state = 1; - } - - if ( this.state == 1 ) { - while ( 2*x <= y ) { - this.a.set(k,x); - this.y -= x; - this.k++; - } - this.l = 1+this.k; - this.state = 2; - } - - if ( this.state == 2 ) { - if ( x <= y ) { - this.a.set(k,x); - this.a.set(l,y); - x += 1; - y -= 1; - return this.a.subList(0, this.k + 2); - } else { - this.state =3; - } - } - - if ( this.state == 3 ) { - this.a.set(k,x+y); - this.y = x + y - 1; - this.state = 0; - return a.subList(0, k + 1); - } - - throw new IllegalStateException("Cannot get here"); - } - - public String toString() { - StringBuffer buf = new StringBuffer(); - buf.append("{ "); - while ( hasNext() ) { - buf.append("["); - buf.append(Utils.join(",",next())); - buf.append("],"); - } - buf.deleteCharAt(buf.lastIndexOf(",")); - buf.append(" }"); - return buf.toString(); - } - - } - - /** - * NextCounts is the enumerator over the state space of the multinomial dirichlet. 
- * - * It filters the partition of the total sum to only those with a number of terms - * equal to the number of categories. - * - * It then generates all permutations of that partition. - * - * In so doing it enumerates over the full state space. - */ - class NextCounts implements Iterator { - - private PartitionGenerator partitioner; - private int numCategories; - private int[] next; - - public NextCounts(int numCategories, int totalCounts) { - partitioner = new PartitionGenerator(totalCounts); - this.numCategories = numCategories; - next = nextFromPartitioner(); - } - - public void remove() { /* do nothing */ } - - public boolean hasNext() { return next != null; } - - public int[] next() { - int[] toReturn = clone(next); - next = nextPermutation(); - if ( next == null ) { - next = nextFromPartitioner(); - } - - return toReturn; - } - - private int[] clone(int[] arr) { - int[] a = new int[arr.length]; - for ( int idx = 0; idx < a.length ; idx ++) { - a[idx] = arr[idx]; - } - - return a; - } - - private int[] nextFromPartitioner() { - if ( partitioner.hasNext() ) { - List nxt = partitioner.next(); - while ( partitioner.hasNext() && nxt.size() > numCategories ) { - nxt = partitioner.next(); - } - - if ( nxt.size() > numCategories ) { - return null; - } else { - int[] buf = new int[numCategories]; - for ( int idx = 0; idx < nxt.size(); idx++ ) { - buf[idx] = nxt.get(idx); - } - Arrays.sort(buf); - return buf; - } - } - - return null; - } - - public int[] nextPermutation() { - return MathUtilsUnitTest.nextPermutation(next); - } - - } - - public static int[] nextPermutation(int[] next) { - // the counts can swap among each other. 
The int[] is originally in ascending order - // this generates the next array in lexicographic order descending - - // locate the last occurrence where next[k] < next[k+1] - int gt = -1; - for ( int idx = 0; idx < next.length-1; idx++) { - if ( next[idx] < next[idx+1] ) { - gt = idx; - } - } - - if ( gt == -1 ) { - return null; - } - - int largestLessThan = gt+1; - for ( int idx = 1 + largestLessThan; idx < next.length; idx++) { - if ( next[gt] < next[idx] ) { - largestLessThan = idx; - } - } - - int val = next[gt]; - next[gt] = next[largestLessThan]; - next[largestLessThan] = val; - - // reverse the tail of the array - int[] newTail = new int[next.length-gt-1]; - int ctr = 0; - for ( int idx = next.length-1; idx > gt; idx-- ) { - newTail[ctr++] = next[idx]; - } - - for ( int idx = 0; idx < newTail.length; idx++) { - next[gt+idx+1] = newTail[idx]; - } - - return next; - } - - - // before testing the dirichlet multinomial, we need to test the - // classes used to test the dirichlet multinomial - - @Test - public void testPartitioner() { - int[] numsToTest = new int[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; - int[] expectedSizes = new int[]{1, 2, 3, 5, 7, 11, 15, 22, 30, 42, 56, 77, 101, 135, 176, 231, 297, 385, 490, 627}; - for ( int testNum = 0; testNum < numsToTest.length; testNum++ ) { - PartitionGenerator gen = new PartitionGenerator(numsToTest[testNum]); - int size = 0; - while ( gen.hasNext() ) { - logger.debug(gen.dataStr()); - size += 1; - gen.next(); - } - Assert.assertEquals(size,expectedSizes[testNum], - String.format("Expected %d partitions, observed %s",expectedSizes[testNum],new PartitionGenerator(numsToTest[testNum]).toString())); - } - } - - @Test - public void testNextPermutation() { - int[] arr = new int[]{1,2,3,4}; - int[][] gens = new int[][] { - new int[]{1,2,3,4}, - new int[]{1,2,4,3}, - new int[]{1,3,2,4}, - new int[]{1,3,4,2}, - new int[]{1,4,2,3}, - new int[]{1,4,3,2}, - new int[]{2,1,3,4}, - new int[]{2,1,4,3}, - new 
int[]{2,3,1,4}, - new int[]{2,3,4,1}, - new int[]{2,4,1,3}, - new int[]{2,4,3,1}, - new int[]{3,1,2,4}, - new int[]{3,1,4,2}, - new int[]{3,2,1,4}, - new int[]{3,2,4,1}, - new int[]{3,4,1,2}, - new int[]{3,4,2,1}, - new int[]{4,1,2,3}, - new int[]{4,1,3,2}, - new int[]{4,2,1,3}, - new int[]{4,2,3,1}, - new int[]{4,3,1,2}, - new int[]{4,3,2,1} }; - for ( int gen = 0; gen < gens.length; gen ++ ) { - for ( int idx = 0; idx < 3; idx++ ) { - Assert.assertEquals(arr[idx],gens[gen][idx], - String.format("Error at generation %d, expected %s, observed %s",gen,Arrays.toString(gens[gen]),Arrays.toString(arr))); - } - arr = nextPermutation(arr); - } - } - - private double[] addEpsilon(double[] counts) { - double[] d = new double[counts.length]; - for ( int i = 0; i < counts.length; i ++ ) { - d[i] = counts[i] + 1e-3; - } - return d; - } - - @Test - public void testDirichletMultinomial() { - List testAlleles = Arrays.asList( - new double[]{80,240}, - new double[]{1,10000}, - new double[]{0,500}, - new double[]{5140,20480}, - new double[]{5000,800,200}, - new double[]{6,3,1000}, - new double[]{100,400,300,800}, - new double[]{8000,100,20,80,2}, - new double[]{90,20000,400,20,4,1280,720,1} - ); - - Assert.assertTrue(! Double.isInfinite(MathUtils.log10Gamma(1e-3)) && ! Double.isNaN(MathUtils.log10Gamma(1e-3))); - - int[] numAlleleSampled = new int[]{2,5,10,20,25}; - for ( double[] alleles : testAlleles ) { - for ( int count : numAlleleSampled ) { - // test that everything sums to one. Generate all multinomial draws - List likelihoods = new ArrayList(100000); - NextCounts generator = new NextCounts(alleles.length,count); - double maxLog = Double.MIN_VALUE; - //List countLog = new ArrayList(200); - while ( generator.hasNext() ) { - int[] thisCount = generator.next(); - //countLog.add(Arrays.toString(thisCount)); - Double likelihood = MathUtils.dirichletMultinomial(addEpsilon(alleles),thisCount); - Assert.assertTrue(! Double.isNaN(likelihood) && ! 
Double.isInfinite(likelihood), - String.format("Likelihood for counts %s and nAlleles %d was %s", - Arrays.toString(thisCount),alleles.length,Double.toString(likelihood))); - if ( likelihood > maxLog ) - maxLog = likelihood; - likelihoods.add(likelihood); - } - //System.out.printf("%d likelihoods and max is (probability) %e\n",likelihoods.size(),Math.pow(10,maxLog)); - Assert.assertEquals(MathUtils.sumLog10(unwrap(likelihoods)),1.0,1e-7, - String.format("Counts %d and alleles %d have nLikelihoods %d. \n Counts: %s", - count,alleles.length,likelihoods.size(), "NODEBUG"/*,countLog*/)); - } - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java deleted file mode 100644 index f5c7a14df..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java +++ /dev/null @@ -1,188 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/21/12 - */ - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for QualityUtils class - */ -public class QualityUtilsUnitTest extends BaseTest { - final private static double TOLERANCE = 1e-9; - - @BeforeClass - public void init() { - } - - @DataProvider(name = "QualTest") - public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); - - for ( int qual = 0; qual < 255; qual++ ) { - tests.add(new Object[]{(byte)(qual & 0xFF), Math.pow(10.0, ((double)qual)/-10.0)}); - } - - return tests.toArray(new Object[][]{}); - } - - /** - * Example testng test using MyDataProvider - */ - @Test(dataProvider = "QualTest") - public void testMyData(final byte qual, final double errorRate) { - final double trueRate = 1 - errorRate; - - final double actualErrorRate = QualityUtils.qualToErrorProb(qual); - Assert.assertEquals(actualErrorRate, errorRate, TOLERANCE); - final double actualTrueRate = QualityUtils.qualToProb(qual); - Assert.assertEquals(actualTrueRate, trueRate, TOLERANCE); - - // log10 tests - final double actualLog10ErrorRate = QualityUtils.qualToErrorProbLog10(qual); - Assert.assertEquals(actualLog10ErrorRate, Math.log10(errorRate), TOLERANCE); - final double actualLog10TrueRate = QualityUtils.qualToProbLog10(qual); - Assert.assertEquals(actualLog10TrueRate, Math.log10(trueRate), TOLERANCE); - - // test that we can convert our error rates to quals, accounting for boundaries - final int 
expectedQual = Math.max(Math.min(qual & 0xFF, QualityUtils.MAX_SAM_QUAL_SCORE), 1); - final byte actualQual = QualityUtils.trueProbToQual(trueRate); - Assert.assertEquals(actualQual, expectedQual & 0xFF); - final byte actualQualFromErrorRate = QualityUtils.errorProbToQual(errorRate); - Assert.assertEquals(actualQualFromErrorRate, expectedQual & 0xFF); - - for ( int maxQual = 10; maxQual < QualityUtils.MAX_SAM_QUAL_SCORE; maxQual++ ) { - final byte maxAsByte = (byte)(maxQual & 0xFF); - final byte expectedQual2 = (byte)(Math.max(Math.min(qual & 0xFF, maxQual), 1) & 0xFF); - final byte actualQual2 = QualityUtils.trueProbToQual(trueRate, maxAsByte); - Assert.assertEquals(actualQual2, expectedQual2, "Failed with max " + maxQual); - final byte actualQualFromErrorRate2 = QualityUtils.errorProbToQual(errorRate, maxAsByte); - Assert.assertEquals(actualQualFromErrorRate2, expectedQual2, "Failed with max " + maxQual); - - // test the integer routines - final byte actualQualInt2 = QualityUtils.trueProbToQual(trueRate, maxQual); - Assert.assertEquals(actualQualInt2, expectedQual2, "Failed with max " + maxQual); - final byte actualQualFromErrorRateInt2 = QualityUtils.errorProbToQual(errorRate, maxQual); - Assert.assertEquals(actualQualFromErrorRateInt2, expectedQual2, "Failed with max " + maxQual); - } - } - - @Test - public void testTrueProbWithMinDouble() { - final byte actual = QualityUtils.trueProbToQual(Double.MIN_VALUE); - Assert.assertEquals(actual, 1, "Failed to convert true prob of min double to 1 qual"); - } - - @Test - public void testTrueProbWithVerySmallValue() { - final byte actual = QualityUtils.trueProbToQual(1.7857786272673852E-19); - Assert.assertEquals(actual, 1, "Failed to convert true prob of very small value 1.7857786272673852E-19 to 1 qual"); - } - - @Test - public void testQualCaches() { - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6); - 
Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6); - - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 30), -3.0, 1e-6); - Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6); - - Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6); - Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6); - Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6); - Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6); - } - - @Test() - public void testBoundingDefault() { - for ( int qual = 0; qual < 1000; qual++ ) { - final byte expected = (byte)Math.max(Math.min(qual, QualityUtils.MAX_SAM_QUAL_SCORE), 1); - Assert.assertEquals(QualityUtils.boundQual(qual), expected); - } - } - - @Test() - public void testBoundingWithMax() { - for ( int max = 10; max < 255; max += 50 ) { - for ( int qual = 0; qual < 1000; qual++ ) { - final int expected = Math.max(Math.min(qual, max), 1); - Assert.assertEquals(QualityUtils.boundQual(qual, (byte)(max & 0xFF)) & 0xFF, expected & 0xFF, "qual " + qual + " max " + max); - } - } - } - - @DataProvider(name = "PhredScaleDoubleOps") - public Object[][] makePhredDoubleTest() { - List tests = new ArrayList(); - - tests.add(new Object[]{0.0, -10 * Math.log10(Double.MIN_VALUE)}); - tests.add(new Object[]{1.0, 0.0}); - for ( int pow = 1; pow < 20; pow++ ) { - tests.add(new Object[]{Math.pow(10.0, -1.0 * pow), pow * 10}); - tests.add(new Object[]{Math.pow(10.0, -1.5 * pow), pow * 15}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test() - public void testQualToErrorProbDouble() { - for ( double qual = 3.0; qual < 255.0; qual += 0.1 ) { - final double expected = 
Math.pow(10.0, qual / -10.0); - Assert.assertEquals(QualityUtils.qualToErrorProb(qual), expected, TOLERANCE, "failed qual->error prob for double qual " + qual); - } - } - - - @Test(dataProvider = "PhredScaleDoubleOps") - public void testPhredScaleDoubleOps(final double errorRate, final double expectedPhredScaled) { - final double actualError = QualityUtils.phredScaleErrorRate(errorRate); - Assert.assertEquals(actualError, expectedPhredScaled, TOLERANCE); - final double trueRate = 1 - errorRate; - final double actualTrue = QualityUtils.phredScaleCorrectRate(trueRate); - if ( trueRate == 1.0 ) { - Assert.assertEquals(actualTrue, QualityUtils.MIN_PHRED_SCALED_QUAL); - } else { - final double tol = errorRate < 1e-10 ? 10.0 : 1e-3; - Assert.assertEquals(actualTrue, expectedPhredScaled, tol); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java deleted file mode 100644 index f92cd4bcf..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.TimeUnit; - -public class SimpleTimerUnitTest extends BaseTest { - private final static String NAME = "unit.test.timer"; - - @Test - public void testSimpleTimer() { - SimpleTimer t = new SimpleTimer(NAME); - Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); - Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); - Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); - Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); - - t.start(); - Assert.assertTrue(t.isRunning(), "Started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); - Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); - long n1 = t.getElapsedTimeNano(); - double t1 = t.getElapsedTime(); - idleLoop(); // idle loop to wait a tiny bit of time - long n2 = t.getElapsedTimeNano(); - double t2 = t.getElapsedTime(); - Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); - Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); - - t.stop(); - Assert.assertFalse(t.isRunning(), "Stopped timer still running"); - long n3 = t.getElapsedTimeNano(); - double t3 = t.getElapsedTime(); - 
idleLoop(); // idle loop to wait a tiny bit of time - double t4 = t.getElapsedTime(); - long n4 = t.getElapsedTimeNano(); - Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); - Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); - - t.restart(); - idleLoop(); // idle loop to wait a tiny bit of time - double t5 = t.getElapsedTime(); - long n5 = t.getElapsedTimeNano(); - Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); - idleLoop(); // idle loop to wait a tiny bit of time - double t6 = t.getElapsedTime(); - long n6 = t.getElapsedTimeNano(); - Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); - Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); - Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); - - final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); - final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); - for ( int i = 0; i < nanoTimes.size(); i++ ) - Assert.assertEquals( - SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), - secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); - } - - @Test - public void testNanoResolution() { - SimpleTimer t = new SimpleTimer(NAME); - - // test the nanosecond resolution - long n7 = t.currentTimeNano(); - int sum = 0; - for ( int i = 0; i < 100; i++) sum += i; - long n8 = t.currentTimeNano(); - final long delta = n8 - n7; - final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); - logger.warn("nanoTime before nano operation " + n7); - logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 
1 millsecond in nano " + oneMilliInNano); - Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); - Assert.assertTrue(delta < oneMilliInNano, - "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); - } - - @Test - public void testMeaningfulTimes() { - SimpleTimer t = new SimpleTimer(NAME); - - t.start(); - for ( int i = 0; i < 100; i++ ) ; - long nano = t.getElapsedTimeNano(); - double secs = t.getElapsedTime(); - - Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); - Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); - - Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); - final long maxTimeInMicro = 10000; - final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); - Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); - } - - private static void idleLoop() { - for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java deleted file mode 100644 index 64a71f060..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java +++ /dev/null @@ -1,326 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* 
copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.io; - -import org.apache.commons.io.FileUtils; -import org.broadinstitute.sting.BaseTest; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -public class IOUtilsUnitTest extends BaseTest { - @Test - public void testGoodTempDir() { - IOUtils.checkTempDir(new File("/tmp/queue")); - } - - @Test(expectedExceptions=UserException.BadTmpDir.class) - public void testBadTempDir() { - IOUtils.checkTempDir(new File("/tmp")); - } - - @Test - public void testAbsoluteSubDir() { - File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new 
File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new File(".")); - Assert.assertEquals(subDir, new File("/different/path")); - } - - @Test - public void testRelativeSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/path/path/to/file")); - } - - @Test - public void testDottedSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new File("path/../to/./file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - - subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - } - - @Test - public void testTempDir() { - File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); - Assert.assertTrue(tempDir.exists()); - Assert.assertFalse(tempDir.isFile()); - Assert.assertTrue(tempDir.isDirectory()); - boolean deleted = IOUtils.tryDelete(tempDir); - Assert.assertTrue(deleted); - Assert.assertFalse(tempDir.exists()); - } - - @Test - public void testDirLevel() { - File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); - Assert.assertEquals(dir, new File("/path")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); - Assert.assertEquals(dir, new File("/path/to")); - - dir = IOUtils.dirLevel(new 
File("/path/to/directory"), 3); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); - Assert.assertEquals(dir, new File("/path/to/directory")); - } - - @Test - public void testAbsolute() { - File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.absolute(new File("/")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/././.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/./directory/.")); - Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory/./")); - Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory./")); - Assert.assertEquals(dir, new File("/directory.")); - - dir = IOUtils.absolute(new File("/./.directory/")); - Assert.assertEquals(dir, new File("/.directory")); - } - - @Test - public void testTail() throws IOException { - List lines = Arrays.asList( - "chr18_random 4262 3154410390 50 51", - "chr19_random 301858 3154414752 50 51", - "chr21_random 1679693 3154722662 50 51", - "chr22_random 257318 3156435963 50 51", - "chrX_random 1719168 3156698441 50 51"); - List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); - Assert.assertEquals(tail.size(), 5); - for (int i = 0; i < 5; i++) - Assert.assertEquals(tail.get(i), lines.get(i)); - } - - @Test - public void testWriteSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("StingText.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteSystemTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("StingText.properties", null)); 
- try { - Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("/StingText.properties", IOUtils.class), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("/StingText.properties", IOUtils.class)); - try { - Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - // Looking for /org/broadinstitute/sting/utils/file/StingText.properties - IOUtils.writeResource(new Resource("StingText.properties", IOUtils.class), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testResourceProperties() { - Resource resource = new Resource("foo", Resource.class); - Assert.assertEquals(resource.getPath(), "foo"); - Assert.assertEquals(resource.getRelativeClass(), Resource.class); - } - - @Test - public void 
testIsSpecialFile() { - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); - Assert.assertFalse(IOUtils.isSpecialFile(null)); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); - } - - @DataProvider( name = "ByteArrayIOTestData") - public Object[][] byteArrayIOTestDataProvider() { - return new Object[][] { - // file size, read buffer size - { 0, 4096 }, - { 1, 4096 }, - { 2000, 4096 }, - { 4095, 4096 }, - { 4096, 4096 }, - { 4097, 4096 }, - { 6000, 4096 }, - { 8191, 4096 }, - { 8192, 4096 }, - { 8193, 4096 }, - { 10000, 4096 } - }; - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToFile(dataWritten, tempFile); - byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); - byte[] dataRead = 
IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) - public void testReadNonExistentFileIntoByteArray() { - File nonExistentFile = new File("djfhsdkjghdfk"); - Assert.assertFalse(nonExistentFile.exists()); - - IOUtils.readFileIntoByteArray(nonExistentFile); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testReadNullStreamIntoByteArray() { - IOUtils.readStreamIntoByteArray(null); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { - IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), - -1); - } - - @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) - public void testWriteByteArrayToUncreatableFile() { - IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testWriteNullByteArrayToFile() { - IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); - } - - @Test( expectedExceptions = ReviewedStingException.class ) - public void testWriteByteArrayToNullStream() { - IOUtils.writeByteArrayToStream(new byte[]{0}, null); - } - - private byte[] getDeterministicRandomData ( int size ) { - GenomeAnalysisEngine.resetRandomGenerator(); - Random rand = GenomeAnalysisEngine.getRandomGenerator(); - - byte[] randomData = new byte[size]; - rand.nextBytes(randomData); - - return randomData; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java deleted file 
mode 100644 index 23a24e180..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ /dev/null @@ -1,1629 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.utils.variant; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.variant.variantcontext.*; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class GATKVariantContextUtilsUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - Allele Aref, T, C, G, Cref, ATC, ATCATC; - - @BeforeSuite - public void setup() { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - } - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); - } - - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); - } - - private VariantContext makeVC(String source, List alleles) { - return makeVC(source, alleles, null, null); - } - - private VariantContext makeVC(String source, List alleles, Genotype... g1) { - return makeVC(source, alleles, Arrays.asList(g1)); - } - - private VariantContext makeVC(String source, List alleles, String filter) { - return makeVC(source, alleles, filter.equals(".") ? 
null : new HashSet(Arrays.asList(filter))); - } - - private VariantContext makeVC(String source, List alleles, Set filters) { - return makeVC(source, alleles, null, filters); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes) { - return makeVC(source, alleles, genotypes, null); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { - int start = 10; - int stop = start; // alleles.contains(ATC) ? start + 3 : start; - return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); - } - - // -------------------------------------------------------------------------------- - // - // Test allele merging - // - // -------------------------------------------------------------------------------- - - private class MergeAllelesTest extends TestDataProvider { - List> inputs; - List expected; - - private MergeAllelesTest(List... arg) { - super(MergeAllelesTest.class); - LinkedList> all = new LinkedList<>(Arrays.asList(arg)); - expected = all.pollLast(); - inputs = all; - } - - public String toString() { - return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); - } - } - @DataProvider(name = "mergeAlleles") - public Object[][] mergeAllelesData() { - // first, do no harm - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, T), - Arrays.asList(Aref, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C), - Arrays.asList(Aref, T), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref, C, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C, T), 
Arrays.asList(Aref, C, T)); - new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); - - new MergeAllelesTest(Arrays.asList(Aref, T, C), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATC)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - // alleles in the order we see them - new MergeAllelesTest(Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATCATC, ATC)); - - // same - new MergeAllelesTest(Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - return MergeAllelesTest.getTests(MergeAllelesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") - public void testMergeAlleles(MergeAllelesTest cfg) { - final List inputs = new ArrayList(); - - int i = 0; - for ( final List alleles : cfg.inputs ) { - final String name = "vcf" + ++i; - inputs.add(makeVC(name, alleles)); - } - - final List priority = vcs2priority(inputs); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, priority, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); - - Assert.assertEquals(merged.getAlleles(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test rsID merging - // - // -------------------------------------------------------------------------------- - - private class SimpleMergeRSIDTest extends TestDataProvider { - List inputs; - String expected; - - private SimpleMergeRSIDTest(String... 
arg) { - super(SimpleMergeRSIDTest.class); - LinkedList allStrings = new LinkedList(Arrays.asList(arg)); - expected = allStrings.pollLast(); - inputs = allStrings; - } - - public String toString() { - return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); - } - } - - @DataProvider(name = "simplemergersiddata") - public Object[][] createSimpleMergeRSIDData() { - new SimpleMergeRSIDTest(".", "."); - new SimpleMergeRSIDTest(".", ".", "."); - new SimpleMergeRSIDTest("rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); - new SimpleMergeRSIDTest(".", "rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates - new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); - - return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") - public void testRSIDMerge(SimpleMergeRSIDTest cfg) { - VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); - final List inputs = new ArrayList(); - - for ( final String id : cfg.inputs ) { - inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); - } - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, null, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); - Assert.assertEquals(merged.getID(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test filtered merging - // - // -------------------------------------------------------------------------------- - - private class 
MergeFilteredTest extends TestDataProvider { - List inputs; - VariantContext expected; - String setExpected; - GATKVariantContextUtils.FilteredRecordMergeType type; - - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { - this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); - } - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { - super(MergeFilteredTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(input1, input2)); - this.expected = expected; - this.type = type; - inputs = all; - this.setExpected = setExpected; - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeFiltered") - public Object[][] mergeFilteredData() { - new MergeFilteredTest("AllPass", - makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("noFilters", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("oneFiltered", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("onePassOneFail", - makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("AllFiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.MERGE_FILTER_IN_ALL); - - // test ALL vs. ANY - new MergeFilteredTest("FailOneUnfiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("OneFailAllUnfilteredArg", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // test excluding allele in filtered record - new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // promotion of site from unfiltered to PASSES - new MergeFilteredTest("UnfilteredPlusPassIsPass", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("RefInAll", - makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_REF_IN_ALL); - - new MergeFilteredTest("RefInOne", - 
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - "2"); - - return MergeFilteredTest.getTests(MergeFilteredTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") - public void testMergeFiltered(MergeFilteredTest cfg) { - final List priority = vcs2priority(cfg.inputs); - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test set field - Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); - - // test filter field - Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); - } - - // -------------------------------------------------------------------------------- - // - // Test genotype merging - // - // -------------------------------------------------------------------------------- - - private class MergeGenotypesTest extends TestDataProvider { - List inputs; - VariantContext expected; - List priority; - - private MergeGenotypesTest(String name, String priority, VariantContext... 
arg) { - super(MergeGenotypesTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(arg)); - this.expected = all.pollLast(); - inputs = all; - this.priority = Arrays.asList(priority.split(",")); - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeGenotypes") - public Object[][] mergeGenotypesData() { - new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); - - new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); - - new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); - - new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PreserveNoCall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PerserveAlleles", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", 
Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), - makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); - - // - // merging genothpes with PLs - // - - // first, do no harm - new MergeGenotypesTest("OrderedPLs", "1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles", "1", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", 
"2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", - makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), - // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); - - return MergeGenotypesTest.getTests(MergeGenotypesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") - public void testMergeGenotypes(MergeGenotypesTest cfg) { - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test genotypes - assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); - } - - // necessary to not overload equals for genotypes - private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { - if (actual == expected) { - return; - } - - if (actual == null || 
expected == null) { - Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); - } - - if (actual.size() != expected.size()) { - Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); - } - - for (Genotype value : actual) { - Genotype expectedValue = expected.get(value.getSampleName()); - - Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); - Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); - Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); - if ( value.hasLikelihoods() ) - Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); - } - } - - @Test(enabled = !DEBUG) - public void testMergeGenotypesUniquify() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); - - // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); - } - -// TODO: remove after testing -// @Test(expectedExceptions = IllegalStateException.class) -// public void testMergeGenotypesRequireUnique() { -// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); -// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); -// -// final VariantContext merged = VariantContextUtils.simpleMerge( -// Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); -// } - - // -------------------------------------------------------------------------------- - // - // Misc. tests - // - // -------------------------------------------------------------------------------- - - @Test(enabled = !DEBUG) - public void testAnnotationSet() { - for ( final boolean annotate : Arrays.asList(true, false)) { - for ( final String set : Arrays.asList("set", "combine", "x")) { - final List priority = Arrays.asList("1", "2"); - VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); - - if ( annotate ) - Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); - else - Assert.assertFalse(merged.hasAttribute(set)); - } - } - } - - private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList<>(); - - for ( final VariantContext vc : vcs ) { - priority.add(vc.getSource()); - } - - return priority; - } - - // -------------------------------------------------------------------------------- - // - // basic allele clipping test - // - // -------------------------------------------------------------------------------- - - private class ReverseClippingPositionTestProvider extends TestDataProvider { - final String ref; - final List alleles = new ArrayList(); - final int expectedClip; - - private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { - super(ReverseClippingPositionTestProvider.class); - this.ref = ref; - for ( final String allele : alleles ) - this.alleles.add(Allele.create(allele)); - this.expectedClip = expectedClip; - } - - @Override - public String toString() { - return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); - } - } - - @DataProvider(name = "ReverseClippingPositionTestProvider") - public Object[][] makeReverseClippingPositionTestProvider() { - // pair clipping - new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); - new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); - new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele - - // triplets - new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go - new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go - - return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); - } - - @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") - public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { - int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); - Assert.assertEquals(result, cfg.expectedClip); - } - - - // -------------------------------------------------------------------------------- - // - // test splitting into bi-allelics - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "SplitBiallelics") - public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { - List tests = new ArrayList(); - - final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); - - // biallelic -> biallelic - tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); - - // monos -> monos - root.alleles(Arrays.asList(Aref)); - tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); - - root.alleles(Arrays.asList(Aref, C, T)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make())}); - - root.alleles(Arrays.asList(Aref, C, T, G)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make(), - root.alleles(Arrays.asList(Aref, G)).make())}); - - final Allele C = Allele.create("C"); - final Allele CA = Allele.create("CA"); - final Allele CAA = Allele.create("CAA"); - final Allele CAAAA = Allele.create("CAAAA"); - final Allele CAAAAA = Allele.create("CAAAAA"); - final Allele Cref = Allele.create("C", true); - final Allele CAref = Allele.create("CA", true); - final Allele CAAref = Allele.create("CAA", true); - final Allele CAAAref = Allele.create("CAAA", true); - - root.alleles(Arrays.asList(Cref, CA, CAA)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CA)).make(), - root.alleles(Arrays.asList(Cref, CAA)).make())}); - - root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAref, C)).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAAref, C)).make(), - root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), - root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), - 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), - root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); - - final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); - final Allele twoCopies = Allele.create("GTTTTATTTTA", true); - final Allele zeroCopies = Allele.create("G", false); - final Allele oneCopies = Allele.create("GTTTTA", false); - tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), - Arrays.asList( - root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), - root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") - public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); - Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext actual = biallelics.get(i); - final VariantContext expected = expectedBiallelics.get(i); - assertVariantContextsAreEqual(actual, expected); - } - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") - public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List genotypes = new ArrayList(); - - int sampleI = 0; - for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { - genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); - } - genotypes.add(GenotypeBuilder.createMissing("missing", 2)); - - final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); - - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext 
actual = biallelics.get(i); - Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples - - for ( final Genotype inputGenotype : genotypes ) { - final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); - Assert.assertNotNull(actualGenotype); - if ( ! vc.isVariant() || vc.isBiallelic() ) - Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); - else - Assert.assertTrue(actualGenotype.isNoCall()); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test repeats - // - // -------------------------------------------------------------------------------- - - private class RepeatDetectorTest extends TestDataProvider { - String ref; - boolean isTrueRepeat; - VariantContext vc; - - private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { - super(RepeatDetectorTest.class); - this.isTrueRepeat = isTrueRepeat; - this.ref = ref; - - List alleles = new LinkedList(); - final Allele refAllele = Allele.create(refAlleleString, true); - alleles.add(refAllele); - for ( final String altString: altAlleleStrings) { - final Allele alt = Allele.create(altString, false); - alleles.add(alt); - } - - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); - this.vc = builder.make(); - } - - public String toString() { - return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); - } - } - - @DataProvider(name = "RepeatDetectorTest") - public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "NAAC", "N", "NA"); - new RepeatDetectorTest(true, "NAAC", "NA", "N"); - new RepeatDetectorTest(false, "NAAC", "NAA", "N"); - new RepeatDetectorTest(false, "NAAC", "N", "NC"); - new RepeatDetectorTest(false, "AAC", "A", "C"); - - // running out of ref bases => false - new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); - - // complex repeats - new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); - - // multi-allelic - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false - - return RepeatDetectorTest.getTests(RepeatDetectorTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") - public void testRepeatDetectorTest(RepeatDetectorTest cfg) { - - // test alleles are equal - Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); - } - - @Test(enabled = !DEBUG) - public void testRepeatAllele() { - Allele nullR = Allele.create("A", true); - Allele nullA = Allele.create("A", false); - Allele atc = Allele.create("AATC", false); - Allele atcatc = Allele.create("AATCATC", false); - Allele ccccR = Allele.create("ACCCC", true); - Allele cc = Allele.create("ACC", false); - Allele cccccc = Allele.create("ACCCCCC", false); - Allele gagaR = Allele.create("AGAGA", true); - Allele gagagaga = Allele.create("AGAGAGAGA", false); - - // - / ATC [ref] from 20-22 - String delLoc = "chr1"; - int delLocStart = 20; - int delLocStop = 22; - - // - [ref] / ATC from 20-20 - String insLoc = "chr1"; - int insLocStart = 20; - int insLocStop = 20; - - Pair,byte[]> result; - 
byte[] refBytes = "TATCATCATCGGA".getBytes(); - - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); - - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); - - - // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 - VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,3); - - // ATC*,A,ATCATC - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],2); - 
Assert.assertEquals(result.getFirst().toArray()[2],4); - Assert.assertEquals(result.getSecond().length,3); - - // simple non-tandem deletion: CCCC*, - - refBytes = "TCCCCCCCCATG".getBytes(); - vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],8); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,1); - - // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 - refBytes = "TCCCCCCCAGAGAGAG".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],7); - Assert.assertEquals(result.getFirst().toArray()[1],5); - Assert.assertEquals(result.getFirst().toArray()[2],3); - Assert.assertEquals(result.getFirst().toArray()[3],9); - Assert.assertEquals(result.getSecond().length,1); - - // GAGA*,-,GAGAGAGA - refBytes = "TGAGAGAGAGATTT".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],5); - Assert.assertEquals(result.getFirst().toArray()[1],3); - Assert.assertEquals(result.getFirst().toArray()[2],7); - Assert.assertEquals(result.getSecond().length,2); - - } - - // -------------------------------------------------------------------------------- - // - // test forward clipping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "ForwardClippingData") - public Object[][] makeForwardClippingData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input 
data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("A"), -1}); - tests.add(new Object[]{Arrays.asList(""), -1}); - tests.add(new Object[]{Arrays.asList("A", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); - tests.add(new Object[]{Arrays.asList("A", "G"), -1}); - tests.add(new Object[]{Arrays.asList("A", "T"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); - tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); - tests.add(new Object[]{Arrays.asList("A", ""), -1}); - for ( int len = 0; len < 50; len++ ) - tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); - - tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") - public void testForwardClipping(final List alleleStrings, final int expectedClip) { - final List alleles = new LinkedList(); - for ( final String alleleString : alleleStrings ) - alleles.add(Allele.create(alleleString)); - - for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { - final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); - Assert.assertEquals(actual, expectedClip); - } - } - - @DataProvider(name = "ClipAlleleTest") - public Object[][] makeClipAlleleTest() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); - - // trims from left and right - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") - public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { - final int start = 10; - final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); - final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); - - Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); - for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { - final Allele trimmed = clipped.getAlleles().get(i); - Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); - } - } - - // -------------------------------------------------------------------------------- - // - // test primitive allele splitting - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "PrimitiveAlleleSplittingData") - public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList<>(); - - // no split - tests.add(new Object[]{"A", "C", 0, null}); - tests.add(new Object[]{"A", "AC", 0, null}); - tests.add(new Object[]{"AC", "A", 0, null}); - - // one split - tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); - tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); - tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); - - // two splits - tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); - tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); - tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); - - // three splits - tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") - public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { - - final int start = 10; - final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - - final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); - - if ( expectedSplit > 0 ) { - Assert.assertEquals(result.size(), expectedSplit); - for ( int i = 0; i < variantPositions.size(); i++ ) { - Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); - } - } else { - Assert.assertEquals(result.size(), 1); - Assert.assertEquals(vc, 
result.get(0)); - } - } - - // -------------------------------------------------------------------------------- - // - // test allele remapping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "AlleleRemappingData") - public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList<>(); - - final Allele originalBase1 = Allele.create((byte)'A'); - final Allele originalBase2 = Allele.create((byte)'T'); - - for ( final byte base1 : BaseUtils.BASES ) { - for ( final byte base2 : BaseUtils.BASES ) { - for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap<>(2); - map.put(originalBase1, Allele.create(base1)); - map.put(originalBase2, Allele.create(base2)); - - tests.add(new Object[]{map, numGenotypes}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") - public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { - - final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); - - final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); - - final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); - - for ( int i = 0; i < numGenotypes; i++ ) { - final Genotype originalG = originalGC.get(String.format("%d", i)); - final Genotype remappedG = remappedGC.get(String.format("%d", i)); - - Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); - for ( int j = 0; j < originalG.getAlleles().size(); j++ ) - Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); - } - } - - private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { - GenomeAnalysisEngine.resetRandomGenerator(); - final Random random = 
GenomeAnalysisEngine.getRandomGenerator(); - - final GenotypesContext gc = GenotypesContext.create(); - for ( int i = 0; i < numGenotypes; i++ ) { - // choose alleles at random - final List myAlleles = new ArrayList(); - myAlleles.add(alleles.get(random.nextInt(2))); - myAlleles.add(alleles.get(random.nextInt(2))); - - final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); - gc.add(g); - } - - return gc; - } - - // -------------------------------------------------------------------------------- - // - // Test subsetDiploidAlleles - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "subsetDiploidAllelesData") - public Object[][] makesubsetDiploidAllelesData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); - - // make sure we don't screw up the simple case - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new 
GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).noAD().make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).noAD().make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).noAD().make())}); - - // uninformative test case - final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homRef3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).noAD().GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefC3AllelesPL).make()).make(), - AC, - Arrays.asList(new 
GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).noAD().GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homC3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).noAD().GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).noAD().GQ(200).make())}); - - // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetCG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).noAD().GQ(200).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).noAD().GQ(200).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "subsetDiploidAllelesData") - public void testsubsetDiploidAllelesData(final VariantContext inputVC, - final List allelesToUse, - final List expectedGenotypes) { - final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, 
expected); - } - } - - @DataProvider(name = "UpdateGenotypeAfterSubsettingData") - public Object[][] makeUpdateGenotypeAfterSubsettingData() { - List tests = new ArrayList(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); - - final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; - final double[] hetPL = new double[]{0.09, 0.9, 0.01}; - final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; - final double[] uninformative = new double[]{0.33, 0.33, 0.33}; - final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); - - for ( final List alleles : allSubsetAlleles ) { - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - } - - for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); -// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); - tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") - public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, - final double[] likelihoods, - final List originalGT, - final List allelesToUse, - final List expectedAlleles) { - final GenotypeBuilder gb = new GenotypeBuilder("test"); - final double[] log10Likelhoods = 
MathUtils.normalizeFromLog10(likelihoods, true, false); - GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, allelesToUse); - final Genotype g = gb.make(); - Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); - } - - @Test(enabled = !DEBUG) - public void testSubsetToRef() { - final Map tests = new LinkedHashMap<>(); - - for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { - for ( final String name : Arrays.asList("test1", "test2") ) { - final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); - builder.DP(10); - builder.GQ(30); - builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); - builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); - final List refs = Collections.nCopies(alleles.size(), Aref); - tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); - } - } - - for ( final int n : Arrays.asList(1, 2, 3) ) { - for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { - final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); - final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); - - Assert.assertEquals(gc.size(), genotypes.size()); - for ( int i = 0; i < genotypes.size(); i++ ) { -// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); - assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); - } - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test updatePLsAndAD - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "updatePLsAndADData") - public Object[][] 
makeUpdatePLsAndADData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); - - // make sure we don't screw up the simple case where no selection happens - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); - - // uninformative test cases - final Genotype uninformativeGT = 
new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - - final int[] homRef3AllelesAD = new int[]{20, 0, 1}; - final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; - final int[] homC3AllelesAD = new int[]{0, 20, 1}; - final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; - final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG - final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new 
GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "updatePLsAndADData") - public void testUpdatePLsAndADData(final VariantContext originalVC, - final VariantContext selectedVC, - final List expectedGenotypes) { - final VariantContext selectedVCwithGTs = new 
VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); - final GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, expected); - } - } - - // -------------------------------------------------------------------------------- - // - // Test methods for merging reference confidence VCs - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "generatePLsData") - public Object[][] makeGeneratePLsData() { - final List tests = new ArrayList<>(); - - for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { - for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { - for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { - final int[] indexes = new int[originalAlleles]; - for ( int i = 0; i < originalAlleles; i++ ) - indexes[i] = i; - indexes[swapPosition1] = swapPosition2; - indexes[swapPosition2] = swapPosition1; - tests.add(new Object[]{originalAlleles, indexes}); - } - } - } - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "generatePLsData") - public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { - - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); - final int[] PLs = new int[numLikelihoods]; - for ( int i = 0; i < numLikelihoods; i++ ) - PLs[i] = i; - - final List alleles = new ArrayList<>(numOriginalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < numOriginalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - final Genotype genotype = new GenotypeBuilder("foo", 
alleles).PL(PLs).make(); - - final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); - - Assert.assertEquals(newPLs.length, numLikelihoods); - - final int[] expectedPLs = new int[numLikelihoods]; - for ( int i = 0; i < numOriginalAlleles; i++ ) { - for ( int j = i; j < numOriginalAlleles; j++ ) { - final int index = GenotypeLikelihoods.calculatePLindex(i, j); - final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); - expectedPLs[index] = value; - } - } - - for ( int i = 0; i < numLikelihoods; i++ ) { - Assert.assertEquals(newPLs[i], expectedPLs[i]); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testGetIndexesOfRelevantAllelesWithNoALT() { - - final List alleles1 = new ArrayList<>(1); - alleles1.add(Allele.create("A", true)); - final List alleles2 = new ArrayList<>(1); - alleles2.add(Allele.create("A", true)); - GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2); - Assert.fail("We should have thrown an exception because the allele was not present"); - } - - @DataProvider(name = "getIndexesOfRelevantAllelesData") - public Object[][] makeGetIndexesOfRelevantAllelesData() { - final int totalAlleles = 5; - final List alleles = new ArrayList<>(totalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < totalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - - final List tests = new ArrayList<>(); - - for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { - tests.add(new Object[]{alleleIndex, alleles}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "getIndexesOfRelevantAllelesData") - public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { - final List myAlleles = new ArrayList<>(3); - - // always add the reference and alleles - myAlleles.add(allAlleles.get(0)); - 
myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - // optionally add another alternate allele - if ( allelesIndex > 0 ) - myAlleles.add(allAlleles.get(allelesIndex)); - - final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles); - - Assert.assertEquals(indexes.length, allAlleles.size()); - - for ( int i = 0; i < allAlleles.size(); i++ ) { - if ( i == 0 ) - Assert.assertEquals(indexes[i], 0); // ref should always match - else if ( i == allelesIndex ) - Assert.assertEquals(indexes[i], 2); // allele - else - Assert.assertEquals(indexes[i], 1); // - } - } - - @DataProvider(name = "referenceConfidenceMergeData") - public Object[][] makeReferenceConfidenceMergeData() { - final List tests = new ArrayList<>(); - - final int start = 10; - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); - final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); - final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); - - final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; - final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; - - final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).make(); - final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); - final Allele AAref = Allele.create("AA", true); - final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).make(); - final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); - final List A_C = Arrays.asList(Aref, C); - final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 
10}).make(); - final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).make(); - final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); - final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).make(); - final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); - final List A_C_G = Arrays.asList(Aref, C, G); - final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).make(); - final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).make(); - final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); - final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).make(); - final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); - final Allele A = Allele.create("A", false); - final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).make(); - final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); - - // first test the case of a single record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); - - // now, test pairs: - // a SNP with another SNP - tests.add(new 
Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).make()).make()}); - // a SNP with an indel - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).make()).make()}); - // a SNP with 2 SNPs - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); - // a SNP with a ref record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); - - // spanning records: - // a SNP with a spanning ref record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); - // a SNP with a spanning deletion - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).make()).make()}); - - // combination of all - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(), - new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(), - new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(), - new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(), - new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 
1000, 1000, 100, 1000, 1000, 1000}).make(), - new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(), - new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()}); - - // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts - tests.add(new Object[]{Arrays.asList(vcAA_ALT), - loc, false, - null}); - tests.add(new Object[]{Arrays.asList(vcAA_ALT), - loc, true, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "referenceConfidenceMergeData") - public void testReferenceConfidenceMerge(final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { - final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? 
(byte)'A' : null); - if ( result == null ) { - Assert.assertTrue(expectedResult == null); - return; - } - Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles()); - Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples()); - for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { - Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); - // use string comparisons to test equality for now - Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); - } - } -} \ No newline at end of file diff --git a/public/package-tests/pom.xml b/public/package-tests/pom.xml new file mode 100644 index 000000000..817cfecdb --- /dev/null +++ b/public/package-tests/pom.xml @@ -0,0 +1,203 @@ + + + 4.0.0 + + + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT + ../sting-root + + + sting-package-tests + pom + Sting Package Tests + + + ${project.basedir}/../.. 
+ true + true + true + true + true + ${project.build.directory}/failsafe-reports/failsafe-summary.xml + + + + + + ${project.groupId} + ${sting.packagetests.artifactId} + ${project.version} + + + + + com.sun + tools + + + + com.google.code.cofoja + cofoja + + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + * + * + + + + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + ${sting.packagetests.basedir}/target + + + + + unittests + + false + + unittests.profile.enabled + true + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + ${sting.packagetests.basedir} + ${sting.packagetests.basedir} + ${project.build.outputDirectory}/ignored_by_package_test + ${sting.packagetests.testClasses} + + + + unit-tests + + test + + + ${sting.packageunittests.skipped} + + + + + + + + + + integrationtests + + false + + integrationtests.profile.enabled + true + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + ${sting.packagetests.basedir} + ${sting.packagetests.basedir} + ${project.build.outputDirectory}/ignored_by_package_test + ${sting.packagetests.testClasses} + ${failsafe.summaryFile} + + + + integration-tests + + verify + + + + ${sting.packageintegrationtests.skipped} + + + + pipeline-tests + + verify + + + + ${sting.packagepipelinetests.skipped} + + + + large-scale-tests + + verify + + + + ${sting.packagelargescaletests.skipped} + + + + knowledge-base-tests + + verify + + + + ${sting.packageknowledgebasetests.skipped} + + + + + + + + + + + diff --git a/public/packages/Aligner.xml b/public/packages/Aligner.xml deleted file mode 100644 index 031dfacfd..000000000 --- a/public/packages/Aligner.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - diff --git a/public/packages/CreatePackager.xsl b/public/packages/CreatePackager.xsl deleted file mode 100644 index a89b6bb35..000000000 --- a/public/packages/CreatePackager.xsl +++ /dev/null @@ -1,220 +0,0 @@ - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml deleted file mode 100644 index 08d2e1c2c..000000000 --- a/public/packages/GATKEngine.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/GenomeAnalysisTK.xml b/public/packages/GenomeAnalysisTK.xml deleted file mode 100644 index e95c992b6..000000000 --- a/public/packages/GenomeAnalysisTK.xml +++ /dev/null @@ -1,40 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/PicardPrivate.xml b/public/packages/PicardPrivate.xml deleted file mode 100644 index d898a5d07..000000000 --- a/public/packages/PicardPrivate.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/public/packages/Queue.xml b/public/packages/Queue.xml deleted file mode 100644 index 621a549d5..000000000 --- a/public/packages/Queue.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/QueueEngine.xml b/public/packages/QueueEngine.xml deleted file mode 100644 index af3e20219..000000000 --- a/public/packages/QueueEngine.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/pom.xml b/public/pom.xml new file mode 100644 index 000000000..89d49997c --- /dev/null +++ b/public/pom.xml @@ -0,0 +1,47 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT + sting-root + + + sting-public + pom + Sting Public + + + sting-root + gsalib + 
sting-utils + gatk-framework + gatk-package + external-example + + + + + ${project.basedir}/.. + + + + + + queue + + + !disable.queue + + + + gatk-queue-extgen + queue-framework + queue-package + + + + + diff --git a/public/queue-framework/pom.xml b/public/queue-framework/pom.xml new file mode 100644 index 000000000..670750fd0 --- /dev/null +++ b/public/queue-framework/pom.xml @@ -0,0 +1,270 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + queue-framework + jar + Queue Framework + + + ${project.basedir}/../.. + ${project.build.directory}/generated-sources/gatk-extensions + false + queue-package + + + + + ${project.groupId} + gatk-framework + ${project.version} + + + org.scala-lang + scala-compiler + + + log4j + log4j + + + net.sf.jgrapht + jgrapht + + + org.apache.commons + commons-email + + + javax.mail + mail + + + + ${project.groupId} + gatk-queue-extgen + ${project.version} + runtime + + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + + + + + org.codehaus.mojo + exec-maven-plugin + + + generate-gatk-extensions + + exec + + generate-sources + + ${sting.generate-gatk-extensions.skipped} + java + + -classpath + + org.broadinstitute.sting.queue.extensions.gatk.GATKExtensionsGenerator + -l + WARN + -outDir + ${gatk.extensions.sources} + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-gatk-extensions + + add-source + + generate-sources + + + ${gatk.extensions.sources} + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + ${sting.generate-resources.phase} + + + + + org.scala-tools + maven-scala-plugin + + + com.pyx4j + maven-junction-plugin + + + link-public-qscript + process-test-resources + + + unlink-public-qscript + clean + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-resource-bundle-log4j + prepare-package + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + 
+ extract-resource-bundle + prepare-package + + + + + org.apache.maven.plugins + maven-invoker-plugin + + + package-unittests + + + package-integrationtests + + + package-largescaletests + + + package-knowledgebasetests + + + package-pipelinetests + + + + + + + + + protected + + + ${basedir}/../../protected/gatk-protected/pom.xml + + + + + ${project.groupId} + gatk-protected + ${project.version} + true + + + + + private + + + ${basedir}/../../private/gatk-private/pom.xml + + + + + ${project.groupId} + gatk-private + ${project.version} + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + + diff --git a/public/queue-framework/src/main/assembly/example-resources.xml b/public/queue-framework/src/main/assembly/example-resources.xml new file mode 100644 index 000000000..7d4ec43ef --- /dev/null +++ b/public/queue-framework/src/main/assembly/example-resources.xml @@ -0,0 +1,20 @@ + + example-resources + + tar.bz2 + + false + + + src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples + . 
+ + ExampleCountReads.scala + ExampleCountLoci.scala + ExampleUnifiedGenotyper.scala + ExampleReadFilter.scala + ExampleCustomWalker.scala + + + + diff --git a/public/java/src/org/broadinstitute/sting/queue/QueueVersion.java b/public/queue-framework/src/main/java/org/broadinstitute/sting/queue/QueueVersion.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/QueueVersion.java rename to public/queue-framework/src/main/java/org/broadinstitute/sting/queue/QueueVersion.java diff --git a/public/java/src/org/broadinstitute/sting/queue/package-info.java b/public/queue-framework/src/main/java/org/broadinstitute/sting/queue/package-info.java similarity index 100% rename from public/java/src/org/broadinstitute/sting/queue/package-info.java rename to public/queue-framework/src/main/java/org/broadinstitute/sting/queue/package-info.java diff --git a/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala new file mode 100644 index 000000000..9bb031c38 --- /dev/null +++ b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -0,0 +1,532 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.qscripts.CNV + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities +import org.broadinstitute.sting.queue.util.DoC._ +import org.broadinstitute.sting.commandline.Hidden +import java.io.{PrintStream, PrintWriter} +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils + +class xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) + var bams: File = _ + + @Input(doc = "gatk jar file", shortName = "J", required = true) + var gatkJarFile: File = _ + + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + var pseqExec: File = _ + + @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) + var pseqSeqDB: String = _ + + @Input(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Input(shortName = "L", doc = "Intervals", required = false) + var intervals: File = _ + + @Argument(doc = "level of parallelism for BAM DoC. 
By default is set to 0 [no scattering].", shortName = "scatter", required = false) + var scatterCountInput = 0 + + @Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Hidden + @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) + var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS + + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + var MAX_DEPTH = 20000 + + @Hidden + @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + var NUM_BINS = 200 + + @Hidden + @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + var START_BIN = 1 + + @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + var minMappingQuality = 0 + + @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) + var minBaseQuality = 0 + + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + var wholeMatrixMemory = -1 + + @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) + var minTargGC : Double = 0.1 + + @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) + var maxTargGC : Double = 0.9 + + @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) + var minTargRepeats : Double = 0.0 + + @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases 
greater than this value", required = false) + var maxTargRepeats : Double = 0.1 + + @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false) + var sampleIDsMap: String = "" + + @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) + var sampleIDsMapFromColumn = 1 + + @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) + var sampleIDsMapToColumn = 2 + + @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) + var targetSampleFiltersString: String = "" + + @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) + var PCAnormalizeMethodString: String = "" + + @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) + var targetSampleNormalizedFiltersString: String = "" + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) + var discoverCommandLineParams: String = "" + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) + var genotypeSubsegments: Boolean = false + + @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) + var maxTargetsInSubsegment = 30 + + @Argument(shortName = 
"subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) + var subsegmentGenotypeThreshold = 20.0 + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + + val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" + + val RD_OUTPUT_SUFFIX: String = ".RD.txt" + + val TARGS_GC_SUFFIX = ".locus_GC.txt" + val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" + + val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" + val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" + + val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" + val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" + + + trait WholeMatrixMemoryLimit extends CommandLineFunction { + // Since loading ALL of the data can take significant memory: + if (wholeMatrixMemory < 0) { + this.memoryLimit = 24 + } + else { + this.memoryLimit = wholeMatrixMemory + } + } + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) + add(prepTargets) + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= prepTargets.out + this.jarFile = qscript.gatkJarFile + this.reference_sequence = qscript.referenceFile + this.logging_level = "INFO" + } + + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) + val samples: List[String] = sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) + var docs: List[DoC] = List[DoC]() + for (group <- groups) { + 
Console.out.printf("Group is %s%n", group) + docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + } + addAll(docs) + + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit with LongRunTime + add(mergeDepths) + + var excludeTargets : List[File] = List[File]() + if (minTargGC > 0 || maxTargGC < 1) { + val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs + calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX + add(calcGCcontents) + + val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) + add(excludeTargetsBasedOnGC) + excludeTargets ::= excludeTargetsBasedOnGC.out + } + + class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { + @Input(doc="") + var intervals: File = prepTargets.out + + @Output(doc="") + var out : File = new File(outFile) + + val regFile : String = outputBase.getPath + ".targets.reg" + val locDB : String = outputBase.getPath + ".targets.LOCDB" + + val removeFiles = "rm -f " + regFile + " " + locDB + val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile + val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" + val calcRepeatMaskedPercent = pseqExec + " . 
loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" + val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out + + var command: String = + removeFiles + + " && " + createRegFile + + " && " + createLOCDB + + " && " + calcRepeatMaskedPercent + + " && " + extractRepeatMaskedPercent + + def commandLine = command + + override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command + } + + if (minTargRepeats > 0 || maxTargRepeats < 1) { + val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) + add(calcRepeatComplexity) + + val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) + add(excludeTargetsBasedOnRepeats) + excludeTargets ::= excludeTargetsBasedOnRepeats.out + } + + val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) + add(filterCenterDepths) + + val pca = new PCA(filterCenterDepths.filteredCentered) + add(pca) + + val normalize = new Normalize(pca) + add(normalize) + + val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) + add(filterZscore) + + val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) + add(filterOriginal) + + val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) + add(discover) + + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotype) + + if (genotypeSubsegments) { + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotypeSegs) + } + } + + class 
ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { + @Input(doc="") + var locus_value : File = locus_valueIn + + @Output(doc="") + var out : File = new File(outputBase.getPath + outSuffix) + + def run = { + var outWriter = new PrintWriter(new PrintStream(out)) + var elems = asScalaIterator(new XReadLines(locus_value)) + + while (elems.hasNext) { + val line = elems.next + val splitLine = line.split("\\s+") + val locus = splitLine(0) + val locValStr = splitLine(1) + try { + val locVal = locValStr.toDouble + if (locVal < minVal || locVal > maxVal) + outWriter.printf("%s%n", locus) + } + catch { + case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) + case e: Exception => throw e + } + } + + outWriter.close + } + } + + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val excludeTargets = excludeTargetsIn + + @Output + val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType target" + + " -o " + filteredCentered + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + if (targetSampleFiltersString != "") + command += " " + targetSampleFiltersString + + def commandLine = command + + override def description = "Filters samples and targets and then mean-centers the targets: " + command + } + + class 
PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + val PCAbase: String = outputBase.getPath + ".RD_PCA" + + @Output + val outPC: File = new File(PCAbase + ".PC.txt") + @Output + val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") + @Output + val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") + + var command: String = + xhmmExec + " --PCA" + + " -r " + input + + " --PCAfiles " + PCAbase + + def commandLine = command + + override def description = "Runs PCA on mean-centered data: " + command + } + + class Normalize(pca: PCA) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = pca.input + + @Input(doc = "") + val inPC = pca.outPC + + @Input(doc = "") + val inPC_SD = pca.outPC_SD + + @Input(doc = "") + val inPC_LOADINGS = pca.outPC_LOADINGS + + @Output + val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") + + var command: String = + xhmmExec + " --normalize" + + " -r " + input + + " --PCAfiles " + pca.PCAbase + + " --normalizeOutput " + normalized + if (PCAnormalizeMethodString != "") + command += " " + PCAnormalizeMethodString + + def commandLine = command + + override def description = "Normalizes mean-centered data using PCA information: " + command + } + + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Output + val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType sample --zScoreData" + + " -o " + filteredZscored + + " 
--outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + if (targetSampleNormalizedFiltersString != "") + command += " " + targetSampleNormalizedFiltersString + + def commandLine = command + + override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command + } + + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) + + @Input(doc = "") + val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) + + @Output + val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + + sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + + " -o " + sameFiltered + + def commandLine = command + + override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command + } + + class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Output + val xcnv: File = new File(outputBase.getPath + ".xcnv") + + @Output + val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") + + val posteriorsBase = outputBase.getPath + + @Output + val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") + + @Output + val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") + + @Output + val dupPosteriors: File = new 
File(posteriorsBase + ".posteriors.DUP.txt") + + var command: String = + xhmmExec + " --discover" + + " -p " + xhmmParams + + " -r " + input + + " -R " + origRD + + " -c " + xcnv + + " -a " + aux_xcnv + + " -s " + posteriorsBase + + " " + discoverCommandLineParams + + def commandLine = command + + override def description = "Discovers CNVs in normalized data: " + command + } + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + var command: String = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " " + genotypeCommandLineParams + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".vcf") + + command += + " -v " + vcf + + def commandLine = command + + override def description = "Genotypes discovered CNVs in all samples: " + command + } + + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") + + command += + " -v " + vcf + + " --subsegments" + + " --maxTargetsInSubsegment " + maxTargetsInSubsegment + + " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold + + def commandLine = command + + override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala 
b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/DevNullOutput.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala similarity index 
100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala similarity index 100% rename from 
public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/Vcf2Table.q diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala similarity index 100% rename from public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala rename to public/queue-framework/src/main/qscripts/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/queue-framework/src/main/resources/org/broadinstitute/sting/queue/util/queueJobReport.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R rename to public/queue-framework/src/main/resources/org/broadinstitute/sting/queue/util/queueJobReport.R diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandLine.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandLine.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandPlugin.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QCommandPlugin.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QException.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QException.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QException.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QException.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScript.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QScript.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScript.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScriptManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QScriptManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QSettings.scala similarity index 100% rename from 
public/scala/src/org/broadinstitute/sting/queue/QSettings.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/QSettings.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/CommandLinePluginManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/FunctionEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/FunctionEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/InProcessRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunInfo.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunInfo.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/JobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/MappingEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/MappingEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/MappingEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QEdge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QEdge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QEdge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraph.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraph.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraphSettings.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QGraphSettings.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QNode.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QNode.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QNode.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/RunnerStatus.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/RunnerStatus.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/RunnerStatus.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobManager.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/pbsengine/PbsEngineJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobManager.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala similarity index 100% rename from 
public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/DistributedScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala similarity index 100% rename from 
public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/PicardMetricsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala rename to 
public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsCommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/CommandLineFunction.scala similarity index 
100% rename from public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/CommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/InProcessFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/InProcessFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/InProcessFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/ListWriterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/ListWriterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/ListWriterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/QFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/QFunction.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ConcatenateLogsFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GatherFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/GathererFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/function/scattergather/SimpleTextGatherFunction.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala similarity index 100% rename from 
public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ClassFieldCache.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ClassFieldCache.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/CollectionUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/CollectionUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/CollectionUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/DoC/package.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/DoC/package.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailMessage.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailMessage.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailSettings.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/EmailSettings.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/EmailSettings.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Logging.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/Logging.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Logging.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobReport.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobReport.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobsReporter.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QJobsReporter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QScriptUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/QScriptUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ReflectionUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ReflectionUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFile.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFile.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala 
b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Retry.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/Retry.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RetryException.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/RetryException.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/RetryException.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ShellUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/ShellUtils.scala diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/StringFileConversions.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/StringFileConversions.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/SystemUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/SystemUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/SystemUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/TextFormatUtils.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/TextFormatUtils.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/TextFormatUtils.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala similarity index 100% rename from public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala rename to public/queue-framework/src/main/scala/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala rename to 
public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestEvalSpec.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala 
b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala diff --git 
a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala 
b/public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala similarity index 100% rename from public/scala/test/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala rename to public/queue-framework/src/test/scala/org/broadinstitute/sting/queue/util/SystemUtilsUnitTest.scala diff --git a/public/queue-package/pom.xml b/public/queue-package/pom.xml new file mode 100644 index 000000000..27b6fae6d --- /dev/null +++ b/public/queue-package/pom.xml @@ -0,0 +1,309 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + queue-package + jar + Queue Package + + + ${project.basedir}/../.. + prepare-package + package + Queue + org.broadinstitute.sting.queue.QCommandLine + + + + + + ${project.groupId} + queue-framework + ${project.version} + + + ${project.groupId} + gatk-package + ${project.version} + + + + org.scala-lang + scala-library + + + org.scala-lang + scala-compiler + + + + net.sf + picard + + + + javax.mail + mail + + + + net.java.dev.jna + jna + + + + com.google.code.cofoja + cofoja + + + + net.sf.snpeff + snpeff + + + + ${project.groupId} + gatk-framework + ${project.version} + example-resources + tar.bz2 + + + ${project.groupId} + queue-framework + ${project.version} + example-resources + tar.bz2 + + + + ${project.groupId} + queue-framework + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-framework + ${project.version} + test-jar + test + + + + org.testng + testng + test + + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + unit-tests + + ${sting.serialunittests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + integration-tests + + ${sting.serialintegrationtests.skipped} + + org.broadinstitute.sting:.* + + + + + pipeline-tests + + ${sting.serialpipelinetests.skipped} + + org.broadinstitute.sting:.* + + + + + large-scale-tests + + 
${sting.seriallargescaletests.skipped} + + org.broadinstitute.sting:.* + + + + + knowledge-base-tests + + ${sting.serialknowledgebasetests.skipped} + + org.broadinstitute.sting:.* + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-direct-dependencies + ${sting.unpack.phase} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + sting-executable + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + binary-dist + ${sting.shade.phase} + + + + + + com.pyx4j + maven-junction-plugin + + + link-binary-jar + ${sting.shade.phase} + + + link-git-release + ${sting.shade.phase} + + + + + + org.apache.maven.plugins + maven-install-plugin + + + default-install + none + + + install-package + install + + + + + + + + + + private + + + ${basedir}/../../private/queue-private/pom.xml + + + + + ${project.groupId} + queue-private + ${project.version} + true + + + ${project.groupId} + queue-private + ${project.version} + test-jar + test + true + + + + + + + com.pyx4j + maven-junction-plugin + + + link-private-testdata + process-test-resources + + + unlink-private-testdata + clean + + + link-private-qscript + process-test-resources + + + unlink-private-qscript + clean + + + + + + + + packagetests-enabled + + + sting.packagetests.enabled + true + + + + none + none + + + + + diff --git a/public/queue-package/src/main/assembly/binary-dist.xml b/public/queue-package/src/main/assembly/binary-dist.xml new file mode 100644 index 000000000..6de236a56 --- /dev/null +++ b/public/queue-package/src/main/assembly/binary-dist.xml @@ -0,0 +1,23 @@ + + binary-dist + + tar.bz2 + + false + + + + org.broadinstitute.sting:queue-package + + ${sting.binary-dist.name}.${artifact.extension} + + + resources + true + + org.broadinstitute.sting:gatk-framework:tar.bz2:example-resources + org.broadinstitute.sting:queue-framework:tar.bz2:example-resources + + + + diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.jar 
b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.jar similarity index 100% rename from settings/repository/com.google.code.cofoja/cofoja-1.0-r139.jar rename to public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.jar diff --git a/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom new file mode 100644 index 000000000..5a6fb69b9 --- /dev/null +++ b/public/repo/com/google/code/cofoja/cofoja/1.0-r139/cofoja-1.0-r139.pom @@ -0,0 +1,9 @@ + + 4.0.0 + com.google.code.cofoja + cofoja + cofoja + 1.0-r139 + diff --git a/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar new file mode 100644 index 000000000..089b71385 Binary files /dev/null and b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.jar differ diff --git a/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom new file mode 100644 index 000000000..fd8a61917 --- /dev/null +++ b/public/repo/net/sf/picard/1.107.1683/picard-1.107.1683.pom @@ -0,0 +1,44 @@ + + + 4.0.0 + net.sf + picard + 1.107.1683 + picard + + + net.sf + sam + 1.107.1683 + + + org.broadinstitute + variant + 1.107.1683 + + + org.broad + tribble + 1.107.1683 + + + + org.apache.ant + ant + 1.8.2 + + + org.apache.ant + ant-launcher + + + + + com.sun + tools.jar + 1.5 + system + ${java.home}/../lib/tools.jar + + + diff --git a/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar new file mode 100644 index 000000000..928838707 Binary files /dev/null and b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.jar differ diff --git a/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom new file mode 100644 index 000000000..89114f546 --- /dev/null +++ 
b/public/repo/net/sf/sam/1.107.1683/sam-1.107.1683.pom @@ -0,0 +1,21 @@ + + + 4.0.0 + net.sf + sam + 1.107.1683 + sam-jdk + + + org.testng + testng + 5.5 + jdk15 + + + org.xerial.snappy + snappy-java + 1.0.3-rc3 + + + diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.5.jar b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.jar similarity index 100% rename from settings/repository/net.sf.snpeff/snpeff-2.0.5.jar rename to public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.jar diff --git a/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom new file mode 100644 index 000000000..d316e2055 --- /dev/null +++ b/public/repo/net/sf/snpeff/snpeff/2.0.5/snpeff-2.0.5.pom @@ -0,0 +1,9 @@ + + 4.0.0 + net.sf + snpeff + snpeff + 2.0.5 + diff --git a/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar new file mode 100644 index 000000000..efa04ad2c Binary files /dev/null and b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.jar differ diff --git a/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom new file mode 100644 index 000000000..7bf169bd4 --- /dev/null +++ b/public/repo/org/broad/tribble/1.107.1683/tribble-1.107.1683.pom @@ -0,0 +1,15 @@ + + + 4.0.0 + org.broad + tribble + 1.107.1683 + tribble + + + net.sf + sam + 1.107.1683 + + + diff --git a/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar new file mode 100644 index 000000000..ea4ebe35e Binary files /dev/null and b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.jar differ diff --git a/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom new file mode 100644 index 
000000000..256963812 --- /dev/null +++ b/public/repo/org/broadinstitute/variant/1.107.1683/variant-1.107.1683.pom @@ -0,0 +1,31 @@ + + + 4.0.0 + org.broadinstitute + variant + 1.107.1683 + variant + + + org.broad + tribble + 1.107.1683 + + + net.sf + sam + 1.107.1683 + + + org.apache.commons + commons-jexl + 2.1.1 + + + + com.google.code.cofoja + cofoja + 1.0-r139 + + + diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala deleted file mode 100644 index 7dd771873..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ /dev/null @@ -1,532 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.queue.qscripts.CNV - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.VCF_BAM_utilities -import org.broadinstitute.sting.queue.util.DoC._ -import org.broadinstitute.sting.commandline.Hidden -import java.io.{PrintStream, PrintWriter} -import org.broadinstitute.sting.utils.text.XReadLines -import collection.JavaConversions._ -import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils - -class xhmmCNVpipeline extends QScript { - qscript => - - @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) - var bams: File = _ - - @Input(doc = "gatk jar file", shortName = "J", required = true) - var gatkJarFile: File = _ - - @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) - var xhmmExec: File = _ - - @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) - var pseqExec: File = _ - - @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) - var pseqSeqDB: String = _ - - @Input(shortName = "R", doc = "ref", required = true) - var referenceFile: File = _ - - @Input(shortName = "L", doc = "Intervals", required = false) - var intervals: File = _ - - @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) - var scatterCountInput = 0 - - @Argument(doc = "Samples to run together for DoC. 
By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) - var samplesPerJob = 1 - - @Output(doc = "Base name for files to output", shortName = "o", required = true) - var outputBase: File = _ - - @Hidden - @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) - var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS - - @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) - var MAX_DEPTH = 20000 - - @Hidden - @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) - var NUM_BINS = 200 - - @Hidden - @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) - var START_BIN = 1 - - @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) - var minMappingQuality = 0 - - @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) - var minBaseQuality = 0 - - @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) - var wholeMatrixMemory = -1 - - @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) - var minTargGC : Double = 0.1 - - @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) - var maxTargGC : Double = 0.9 - - @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) - var minTargRepeats : Double = 0.0 - - @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false) - var maxTargRepeats : Double = 0.1 - - @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired 
sample IDs", required = false) - var sampleIDsMap: String = "" - - @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) - var sampleIDsMapFromColumn = 1 - - @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) - var sampleIDsMapToColumn = 2 - - @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) - var targetSampleFiltersString: String = "" - - @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) - var PCAnormalizeMethodString: String = "" - - @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) - var targetSampleNormalizedFiltersString: String = "" - - @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) - var xhmmParamsArg: File = _ - - @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) - var discoverCommandLineParams: String = "" - - @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) - var genotypeCommandLineParams: String = "" - - @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) - var genotypeSubsegments: Boolean = false - - @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) - var maxTargetsInSubsegment = 30 - - @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) - var subsegmentGenotypeThreshold 
= 20.0 - - @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) - var longJobQueue: String = "" - - - val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" - - val RD_OUTPUT_SUFFIX: String = ".RD.txt" - - val TARGS_GC_SUFFIX = ".locus_GC.txt" - val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" - - val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" - val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" - - val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" - val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" - - - trait WholeMatrixMemoryLimit extends CommandLineFunction { - // Since loading ALL of the data can take significant memory: - if (wholeMatrixMemory < 0) { - this.memoryLimit = 24 - } - else { - this.memoryLimit = wholeMatrixMemory - } - } - - trait LongRunTime extends CommandLineFunction { - if (longJobQueue != "") - this.jobQueue = longJobQueue - } - - def script = { - val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) - add(prepTargets) - - trait CommandLineGATKArgs extends CommandLineGATK { - this.intervals :+= prepTargets.out - this.jarFile = qscript.gatkJarFile - this.reference_sequence = qscript.referenceFile - this.logging_level = "INFO" - } - - val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) - val samples: List[String] = sampleToBams.keys.toList - Console.out.printf("Samples are %s%n", samples) - - val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) - var docs: List[DoC] = List[DoC]() - for (group <- groups) { - Console.out.printf("Group is %s%n", group) - docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, 
NUM_BINS, Nil) with CommandLineGATKArgs - } - addAll(docs) - - val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit - add(mergeDepths) - - var excludeTargets : List[File] = List[File]() - if (minTargGC > 0 || maxTargGC < 1) { - val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs - calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX - add(calcGCcontents) - - val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) - add(excludeTargetsBasedOnGC) - excludeTargets ::= excludeTargetsBasedOnGC.out - } - - class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { - @Input(doc="") - var intervals: File = prepTargets.out - - @Output(doc="") - var out : File = new File(outFile) - - val regFile : String = outputBase.getPath + ".targets.reg" - val locDB : String = outputBase.getPath + ".targets.LOCDB" - - val removeFiles = "rm -f " + regFile + " " + locDB - val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile - val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" - val calcRepeatMaskedPercent = pseqExec + " . 
loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" - val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out - - var command: String = - removeFiles + - " && " + createRegFile + - " && " + createLOCDB + - " && " + calcRepeatMaskedPercent + - " && " + extractRepeatMaskedPercent - - def commandLine = command - - override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command - } - - if (minTargRepeats > 0 || maxTargRepeats < 1) { - val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) - add(calcRepeatComplexity) - - val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) - add(excludeTargetsBasedOnRepeats) - excludeTargets ::= excludeTargetsBasedOnRepeats.out - } - - val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) - add(filterCenterDepths) - - val pca = new PCA(filterCenterDepths.filteredCentered) - add(pca) - - val normalize = new Normalize(pca) - add(normalize) - - val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) - add(filterZscore) - - val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) - add(filterOriginal) - - val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) - add(discover) - - val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) - add(genotype) - - if (genotypeSubsegments) { - val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) - add(genotypeSegs) - } - } - - class 
ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { - @Input(doc="") - var locus_value : File = locus_valueIn - - @Output(doc="") - var out : File = new File(outputBase.getPath + outSuffix) - - def run = { - var outWriter = new PrintWriter(new PrintStream(out)) - var elems = asScalaIterator(new XReadLines(locus_value)) - - while (elems.hasNext) { - val line = elems.next - val splitLine = line.split("\\s+") - val locus = splitLine(0) - val locValStr = splitLine(1) - try { - val locVal = locValStr.toDouble - if (locVal < minVal || locVal > maxVal) - outWriter.printf("%s%n", locus) - } - catch { - case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) - case e: Exception => throw e - } - } - - outWriter.close - } - } - - class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val excludeTargets = excludeTargetsIn - - @Output - val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) - @Output - val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) - @Output - val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - " --centerData --centerType target" + - " -o " + filteredCentered + - " --outputExcludedTargets " + filteredTargets + - " --outputExcludedSamples " + filteredSamples - command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) - if (targetSampleFiltersString != "") - command += " " + targetSampleFiltersString - - def commandLine = command - - override def description = "Filters samples and targets and then mean-centers the targets: " + command - } - - class PCA(inputParam: File) 
extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - val PCAbase: String = outputBase.getPath + ".RD_PCA" - - @Output - val outPC: File = new File(PCAbase + ".PC.txt") - @Output - val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") - @Output - val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") - - var command: String = - xhmmExec + " --PCA" + - " -r " + input + - " --PCAfiles " + PCAbase - - def commandLine = command - - override def description = "Runs PCA on mean-centered data: " + command - } - - class Normalize(pca: PCA) extends CommandLineFunction { - @Input(doc = "") - val input = pca.input - - @Input(doc = "") - val inPC = pca.outPC - - @Input(doc = "") - val inPC_SD = pca.outPC_SD - - @Input(doc = "") - val inPC_LOADINGS = pca.outPC_LOADINGS - - @Output - val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") - - var command: String = - xhmmExec + " --normalize" + - " -r " + input + - " --PCAfiles " + pca.PCAbase + - " --normalizeOutput " + normalized - if (PCAnormalizeMethodString != "") - command += " " + PCAnormalizeMethodString - - def commandLine = command - - override def description = "Normalizes mean-centered data using PCA information: " + command - } - - class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Output - val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) - @Output - val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) - @Output - val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - " --centerData --centerType sample --zScoreData" + - " -o " + filteredZscored + - " --outputExcludedTargets " + filteredTargets + - " --outputExcludedSamples " + 
filteredSamples - if (targetSampleNormalizedFiltersString != "") - command += " " + targetSampleNormalizedFiltersString - - def commandLine = command - - override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command - } - - class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) - - @Input(doc = "") - val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) - - @Output - val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) - - var command: String = - xhmmExec + " --matrix" + - " -r " + input + - targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + - sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + - " -o " + sameFiltered - - def commandLine = command - - override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command - } - - class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val xhmmParams = xhmmParamsArg - - @Input(doc = "") - val origRD = origRDParam - - @Output - val xcnv: File = new File(outputBase.getPath + ".xcnv") - - @Output - val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") - - val posteriorsBase = outputBase.getPath - - @Output - val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") - - @Output - val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") - - @Output - val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt") - - var command: String = - xhmmExec + " --discover" 
+ - " -p " + xhmmParams + - " -r " + input + - " -R " + origRD + - " -c " + xcnv + - " -a " + aux_xcnv + - " -s " + posteriorsBase + - " " + discoverCommandLineParams - - def commandLine = command - - override def description = "Discovers CNVs in normalized data: " + command - } - - abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val xhmmParams = xhmmParamsArg - - @Input(doc = "") - val origRD = origRDParam - - @Input(doc = "") - val inXcnv = xcnv - - var command: String = - xhmmExec + " --genotype" + - " -p " + xhmmParams + - " -r " + input + - " -g " + inXcnv + - " -F " + referenceFile + - " -R " + origRD + - " " + genotypeCommandLineParams - } - - class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { - @Output - val vcf: File = new File(outputBase.getPath + ".vcf") - - command += - " -v " + vcf - - def commandLine = command - - override def description = "Genotypes discovered CNVs in all samples: " + command - } - - class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { - @Output - val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") - - command += - " -v " + vcf + - " --subsegments" + - " --maxTargetsInSubsegment " + maxTargetsInSubsegment + - " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold - - def commandLine = command - - override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command - } -} diff --git a/public/sting-root/pom.xml b/public/sting-root/pom.xml new file mode 100644 index 000000000..84edd9be5 --- /dev/null +++ b/public/sting-root/pom.xml @@ -0,0 +1,612 @@ + + + 4.0.0 + + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT 
+ pom + Sting Root + + + 3.0.4 + + + + UTF-8 + ${sourceEncoding} + ${sourceEncoding} + 1.7 + 1.7 + yyyy/MM/dd HH:mm:ss + ${project.basedir}/../.. + true + ${sting.committests.skipped} + ${sting.committests.skipped} + ${sting.committests.skipped} + true + true + false + 1g + 4g + -Xmx${test.maxmemory} + + + 1.107.1683 + ${picard.public.version} + ${picard.public.version} + ${picard.public.version} + ${picard.public.version} + + + + + + + org.scala-lang + scala-compiler + 2.10.2 + + + org.scala-lang + scala-library + 2.10.2 + + + com.google.code.cofoja + cofoja + 1.0-r139 + + + net.sf + sam + ${sam.version} + + + org.testng + testng + + + + + net.sf + picard + ${picard.version} + + + org.broad + tribble + ${tribble.version} + + + org.broadinstitute + variant + ${variant.version} + + + log4j + log4j + 1.2.15 + + + com.sun.jdmk + jmxtools + + + javax.jms + jms + + + com.sun.jmx + jmxri + + + + + javax.mail + mail + 1.4.4 + + + colt + colt + 1.2.0 + + + it.unimi.dsi + fastutil + 6.5.3 + + + org.simpleframework + simple-xml + 2.0.4 + + + org.reflections + reflections + 0.9.8 + + + org.slf4j + slf4j-log4j12 + 1.6.1 + + + gov.nist.math + jama + 1.0.2 + + + net.sf.jgrapht + jgrapht + 0.8.3 + + + org.freemarker + freemarker + 2.3.18 + + + org.apache.commons + commons-email + 1.2 + + + org.apache.commons + commons-jexl + 2.1.1 + + + commons-lang + commons-lang + 2.5 + + + commons-logging + commons-logging + 1.1.1 + + + commons-io + commons-io + 2.1 + + + commons-collections + commons-collections + 3.2.1 + + + org.apache.commons + commons-math + 2.2 + + + net.java.dev.jna + jna + 3.2.7 + + + net.java.dev.jets3t + jets3t + 0.8.1 + + + us.levk + drmaa-gridengine + 6.2u5 + + + net.sf.snpeff + snpeff + 2.0.5 + + + org.mongodb + mongo-java-driver + 2.7.3 + + + com.google.code.gson + gson + 2.2.2 + + + org.apache.httpcomponents + httpclient + 4.1.1 + + + + + com.sun + tools + 1.4.2 + system + ${java.home}/../lib/tools.jar + + + + org.testng + testng + 6.8 + test + + + 
com.google.caliper + caliper + 0.5-rc1 + test + + + + com.google.guava + guava + + + + + + + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.2.1 + + + com.lukegb.mojo + gitdescribe-maven-plugin + 2.0 + + + org.codehaus.mojo + build-helper-maven-plugin + 1.8 + + + org.apache.maven.plugins + maven-clean-plugin + 2.5 + + + org.apache.maven.plugins + maven-dependency-plugin + 2.8 + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + + -Xmx${scala.maxmemory} + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + org.apache.maven.plugins + maven-shade-plugin + 2.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.16 + + + true + false + false + plain + ${test.args} + + + usedefaultlisteners + false + + + + + diff + true + ${java.io.tmpdir} + + + + + default-test + none + + + + unit-tests + + test + + + + ${sting.unittests.skipped} + + **/*UnitTest.class + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.16 + + + true + false + false + plain + ${test.args} + + + usedefaultlisteners + false + + + + + diff + true + + ${sting.pipelinetests.run} + ${java.io.tmpdir} + + + + + integration-tests + + integration-test + verify + + + + + ${sting.integrationtests.skipped} + + **/*IntegrationTest.class + + + + + pipeline-tests + + integration-test + verify + + + + + ${sting.pipelinetests.skipped} + + **/*PipelineTest.class + + + + + large-scale-tests + + integration-test + verify + + + + ${sting.largescaletests.skipped} + + **/*LargeScaleTest.class + + + + + knowledge-base-tests + + integration-test + verify + + + + ${sting.knowledgebasetests.skipped} + + **/*KnowledgeBaseTest.class + + + + + + + + com.google.code.sortpom + maven-sortpom-plugin + 2.2 + + false + custom_1 + \n + 
${sourceEncoding} + true + scope + 4 + false + + + + + sort + + verify + + + + + + + com.pyx4j + maven-junction-plugin + 1.0.3 + + + org.apache.maven.plugins + maven-invoker-plugin + 1.8 + + + org.apache.maven.plugins + maven-install-plugin + 2.5 + + + org.apache.maven.plugins + maven-site-plugin + 3.3 + + + + + + + + com.lukegb.mojo + gitdescribe-maven-plugin + + + --long + + true + git.version + exported + + + + gitdescribe-initialize + + gitdescribe + + initialize + + + gitdescribe-presite + + gitdescribe + + pre-site + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + + fix-version-initialize + + regex-property + + initialize + + build.version + ${git.version} + git- + + + + fix-version-pre-site + + regex-property + + pre-site + + build.version + ${git.version} + git- + + + + + + com.google.code.sortpom + maven-sortpom-plugin + + + default + + sort + + verify + + + + + + + + true + + + + + sting.public.repo.local + Sting Public Local Repository + file:${sting.basedir}/public/repo + + + + diff --git a/public/sting-utils/pom.xml b/public/sting-utils/pom.xml new file mode 100644 index 000000000..7d7ae87b4 --- /dev/null +++ b/public/sting-utils/pom.xml @@ -0,0 +1,159 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-aggregator + 2.8-SNAPSHOT + ../.. + + + sting-utils + jar + Sting Utils + + + ${project.basedir}/../.. 
+ org/broadinstitute/sting/utils/R + gsalib.tar.gz + + + + + net.sf + sam + + + net.sf + picard + + + org.broad + tribble + + + org.broadinstitute + variant + + + log4j + log4j + + + colt + colt + + + it.unimi.dsi + fastutil + + + org.simpleframework + simple-xml + + + org.reflections + reflections + + + org.slf4j + slf4j-log4j12 + + + org.freemarker + freemarker + + + org.apache.commons + commons-jexl + + + commons-lang + commons-lang + + + commons-io + commons-io + + + commons-collections + commons-collections + + + org.apache.commons + commons-math + + + net.java.dev.jna + jna + + + net.java.dev.jets3t + jets3t + + + us.levk + drmaa-gridengine + + + com.google.code.gson + gson + + + org.apache.httpcomponents + httpclient + + + com.google.code.cofoja + cofoja + + + ${project.groupId} + gsalib + ${project.version} + tar.gz + + + + org.testng + testng + test + + + com.google.caliper + caliper + test + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-gsalib + + copy + + process-resources + + + + ${project.groupId} + gsalib + ${project.version} + tar.gz + ${project.build.outputDirectory}/${gsalib.packagedir} + ${gsalib.filename} + + + + + + + + + diff --git a/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties b/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties new file mode 100644 index 000000000..38c8335c9 --- /dev/null +++ b/public/sting-utils/src/main/config/org/broadinstitute/sting/utils/help/log4j.properties @@ -0,0 +1,7 @@ +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html index f4fb74af1..ff9df5eea 100644 --- a/settings/helpTemplates/common.html +++ b/settings/helpTemplates/common.html @@ 
-86,7 +86,13 @@ Support Forum

-

GATK version ${version} built at ${timestamp}.

+

GATK version ${version} built at ${timestamp}. + <#-- closing P tag in next macro --> + + + <#macro footerClose> + <#-- ugly little hack to enable adding tool-specific info inline --> +

<#macro pageFooter> diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index a5650d55e..0398b829d 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -58,7 +58,7 @@ ${version}
- <#assign seq = ["engine", "tools", "utilities", "other"]> + <#assign seq = ["engine", "tools", "utilities", "other", "dev"]> <#list seq as supercat>
<#list groups?sort_by("name") as group> @@ -70,4 +70,5 @@
<@footerInfo /> +<@footerClose /> <@pageFooter /> diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index eea741669..d4aa7c7f9 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -31,45 +31,70 @@ <#list myargs as arg> - ${arg.name} - ${arg.type} + ${arg.name}
+ <#if arg.synonyms??> + <#if arg.name[2..] != arg.synonyms[1..]> +  ${arg.synonyms} + + + + ${arg.defaultValue!"NA"} ${arg.summary} - <#-- - < - td>${arg.required} - --> <#macro argumentDetails arg> -

${arg.name} - <#if arg.synonyms??> / ${arg.synonyms} - - ( - <#if arg.attributes??>${arg.attributes} - ${arg.type} - <#if arg.defaultValue??> with default value ${arg.defaultValue} - ) -

-

- ${arg.summary}. ${arg.fulltext} - <#if arg.rodTypes??>${arg.name} binds reference ordered data. This argument supports ROD files of the - following types: ${arg.rodTypes} - - <#if arg.options??> -
- The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values: -

- <#list arg.options as option> -
${option.name}
-
${option.summary}
- -
- -

+
+

${arg.name} + <#if arg.synonyms??> / ${arg.synonyms} +

+

+ ${arg.summary}
+ ${arg.fulltext} +

+ + + <#if arg.rodTypes??> +

${arg.name} binds reference ordered data. This argument supports ROD files of the following types: ${arg.rodTypes}

+ + <#if arg.options??> +

+ The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values: +

+ <#list arg.options as option> +
${option.name}
+
${option.summary}
+ +
+

+ +

<#if arg.required??> + <#if arg.required == "yes"> + R + + + ${arg.type} + <#if arg.defaultValue??> +  ${arg.defaultValue} + + <#if arg.minValue??> +  [ [ ${arg.minValue} + + <#if arg.minRecValue??> +  [ ${arg.minRecValue} + + <#if arg.maxRecValue??> +  ${arg.maxRecValue} ] + + <#if arg.maxValue??> +  ${arg.maxValue} ] ] + +

<#macro relatedByType name type> <#list relatedDocs as relatedDoc> @@ -103,11 +128,12 @@

${name}

${summary}

+ <#-- using goto dev annotation instead, see above footer <#if author??>

Author ${author}

- + --> <#if group?? >

Category ${group} @@ -229,12 +255,12 @@ <#-- Create the argument summary --> <#if arguments.all?size != 0>

${name} specific arguments

-

This table summarizes the command-line arguments that are specific to this tool. For details, see the list further down below the table.

+

This table summarizes the command-line arguments that are specific to this tool. For more details on each argument, see the list further down below the table or click on an argument name to jump directly to that entry in the list.

- - + + @@ -267,6 +293,11 @@ <@argumentDetails arg=arg/> - + <@footerInfo /> + <#-- Specify go-to developer (for internal use) --> + <#if gotoDev??> + GTD: ${gotoDev} + + <@footerClose /> <@pageFooter /> \ No newline at end of file diff --git a/settings/ivysettings.properties b/settings/ivysettings.properties deleted file mode 100644 index 8a67434f9..000000000 --- a/settings/ivysettings.properties +++ /dev/null @@ -1 +0,0 @@ -repository.dir=${ivy.settings.dir}/repository diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml deleted file mode 100644 index ce7667140..000000000 --- a/settings/ivysettings.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar b/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar deleted file mode 100644 index 3f7007457..000000000 Binary files a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.jar and /dev/null differ diff --git a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml b/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml deleted file mode 100644 index 0e2b727e1..000000000 --- a/settings/repository/com.google.code.caliper/caliper-1.0-SNAPSHOT.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml b/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml deleted file mode 100644 index 202d3d0a3..000000000 --- a/settings/repository/com.google.code.cofoja/cofoja-1.0-r139.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2872.jar b/settings/repository/edu.mit.broad/picard-private-parts-2872.jar deleted file mode 100644 index b6e685684..000000000 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-2872.jar and /dev/null differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2872.xml 
b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml deleted file mode 100644 index 677d27d80..000000000 --- a/settings/repository/edu.mit.broad/picard-private-parts-2872.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/gov.nist/Jama-1.0.2.jar b/settings/repository/gov.nist/Jama-1.0.2.jar deleted file mode 100644 index 824d13338..000000000 Binary files a/settings/repository/gov.nist/Jama-1.0.2.jar and /dev/null differ diff --git a/settings/repository/gov.nist/Jama-1.0.2.xml b/settings/repository/gov.nist/Jama-1.0.2.xml deleted file mode 100644 index 57ea106ea..000000000 --- a/settings/repository/gov.nist/Jama-1.0.2.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar deleted file mode 100644 index dc77c7d33..000000000 Binary files a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar and /dev/null differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar deleted file mode 100644 index f267be4b5..000000000 Binary files a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar and /dev/null differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml deleted file mode 100644 index c6a8da052..000000000 --- a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml deleted file mode 100644 index 9a622abe5..000000000 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.104.1628.jar b/settings/repository/net.sf/picard-1.104.1628.jar deleted file mode 100644 index 0a24f339e..000000000 
Binary files a/settings/repository/net.sf/picard-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/net.sf/picard-1.104.1628.xml b/settings/repository/net.sf/picard-1.104.1628.xml deleted file mode 100644 index bd0c9fbcd..000000000 --- a/settings/repository/net.sf/picard-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.104.1628.jar b/settings/repository/net.sf/sam-1.104.1628.jar deleted file mode 100644 index ad8437d03..000000000 Binary files a/settings/repository/net.sf/sam-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/net.sf/sam-1.104.1628.xml b/settings/repository/net.sf/sam-1.104.1628.xml deleted file mode 100644 index 0b0cba93f..000000000 --- a/settings/repository/net.sf/sam-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-1.104.1628.jar b/settings/repository/org.broad/tribble-1.104.1628.jar deleted file mode 100644 index 134be3d94..000000000 Binary files a/settings/repository/org.broad/tribble-1.104.1628.jar and /dev/null differ diff --git a/settings/repository/org.broad/tribble-1.104.1628.xml b/settings/repository/org.broad/tribble-1.104.1628.xml deleted file mode 100644 index c2158e974..000000000 --- a/settings/repository/org.broad/tribble-1.104.1628.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broadinstitute/variant-1.105.1642.jar b/settings/repository/org.broadinstitute/variant-1.105.1642.jar deleted file mode 100644 index c173d0fdd..000000000 Binary files a/settings/repository/org.broadinstitute/variant-1.105.1642.jar and /dev/null differ diff --git a/settings/repository/org.broadinstitute/variant-1.105.1642.xml b/settings/repository/org.broadinstitute/variant-1.105.1642.xml deleted file mode 100644 index d1846ce23..000000000 --- a/settings/repository/org.broadinstitute/variant-1.105.1642.xml +++ /dev/null @@ -1,3 +0,0 @@ - - -
NameTypeArgument name(s) Default value Summary